src/google/protobuf/util/internal/json_escaping.cc - platform/prebuilts/libprotobuf/linux - Git at Google

 // Protocol Buffers - Google's data interchange format
 // Copyright 2008 Google Inc.  All rights reserved.
 // https://developers.google.com/protocol-buffers/
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
 //
 //     * Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above
 // copyright notice, this list of conditions and the following disclaimer
 // in the documentation and/or other materials provided with the
 // distribution.
 //     * Neither the name of Google Inc. nor the names of its
 // contributors may be used to endorse or promote products derived from
 // this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include <google/protobuf/util/internal/json_escaping.h>

 #include <cstdint>

 #include <google/protobuf/stubs/logging.h>
 #include <google/protobuf/stubs/common.h>

 namespace google {
 namespace protobuf {
 namespace util {
 namespace converter {

 namespace {

 // Lookup table: kHex[n] is the lowercase hexadecimal digit for the nibble
 // value n (0 <= n <= 15).
 static const char kHex[] =
     "0123456789"
     "abcdef";

 // Characters 0x00 to 0x9f are very commonly used, so we provide a special
 // table lookup.
 //
 // For unicode code point ch < 0xa0:
 // kCommonEscapes[ch] is the escaped string of ch, if escaping is needed;
 //                    or an empty string, if escaping is not needed.
 //
 // The longest entry is 6 characters ("\uXXXX"); the 7th byte of each row
 // holds the terminating NUL. The trailing comment on each source line gives
 // the range of indices that the line covers.
 static const char kCommonEscapes[160][7] = {
     // C0 (ASCII and derivatives) control characters
     "\\u0000", "\\u0001", "\\u0002", "\\u0003",  // 0x00..0x03
     "\\u0004", "\\u0005", "\\u0006", "\\u0007", "\\b", "\\t", "\\n", "\\u000b",  // 0x04..0x0b
     "\\f", "\\r", "\\u000e", "\\u000f", "\\u0010", "\\u0011", "\\u0012",  // 0x0c..0x12
     "\\u0013",  // 0x13
     "\\u0014", "\\u0015", "\\u0016", "\\u0017", "\\u0018", "\\u0019", "\\u001a",  // 0x14..0x1a
     "\\u001b", "\\u001c", "\\u001d", "\\u001e", "\\u001f",  // 0x1b..0x1f
     // Escaping of " and \ are required by www.json.org string definition.
     // Escaping of < and > are required for HTML security.
     "", "", "\\\"", "", "", "", "", "",                              // 0x20..0x27 ('"' is 0x22)
     "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",  // 0x28..0x37
     "", "", "", "", "\\u003c", "", "\\u003e", "", "", "", "", "", "", "", "",  // 0x38..0x46 ('<' is 0x3c, '>' is 0x3e)
     "",                                                                  // 0x47
     "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",      // 0x48..0x57
     "", "", "", "", "\\\\", "", "", "", "", "", "", "", "", "", "", "",  // 0x58..0x67 ('\' is 0x5c)
     "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",      // 0x68..0x77
     "", "", "", "", "", "", "", "\\u007f",  // 0x78..0x7f (DEL is escaped)
     // C1 (ISO 8859 and Unicode) extended control characters
     "\\u0080", "\\u0081", "\\u0082", "\\u0083",  // 0x80..0x83
     "\\u0084", "\\u0085", "\\u0086", "\\u0087", "\\u0088", "\\u0089", "\\u008a",  // 0x84..0x8a
     "\\u008b", "\\u008c", "\\u008d", "\\u008e", "\\u008f", "\\u0090", "\\u0091",  // 0x8b..0x91
     "\\u0092", "\\u0093",  // 0x92..0x93
     "\\u0094", "\\u0095", "\\u0096", "\\u0097", "\\u0098", "\\u0099", "\\u009a",  // 0x94..0x9a
     "\\u009b", "\\u009c", "\\u009d", "\\u009e", "\\u009f"};  // 0x9b..0x9f

 // Reports whether the value (c) is a unicode surrogate code unit, i.e. either
 // a high surrogate or a low surrogate.
 inline bool IsSurrogate(uint32_t c) {
   // Masking off the low 11 bits and comparing with kMinHighSurrogate is the
   // cheaper equivalent of the range test
   //   c >= kMinHighSurrogate && c <= kMaxLowSurrogate
   // (2 ALU instructions instead of 3).
   const uint32_t block_base = c & 0xfffff800;
   return block_base == JsonEscaping::kMinHighSurrogate;
 }

 // Reports whether (cp) lies inside the unicode code space, that is
 // 0 <= cp <= kMaxCodePoint. The lower bound holds trivially because (cp) is
 // unsigned.
 inline bool IsValidCodePoint(uint32_t cp) {
   const bool beyond_code_space = cp > JsonEscaping::kMaxCodePoint;
   return !beyond_code_space;
 }

 // Computes the low (trailing) surrogate of the UTF-16 pair that represents
 // code point (cp). Only meaningful for supplementary characters; for any
 // other input the result carries no meaning.
 inline uint16_t ToLowSurrogate(uint32_t cp) {
   // Width of the low-surrogate range, used as a mask to pick the payload bits
   // of (cp) that belong in the low surrogate.
   const auto payload_mask =
       JsonEscaping::kMaxLowSurrogate - JsonEscaping::kMinLowSurrogate;
   return (cp & payload_mask) + JsonEscaping::kMinLowSurrogate;
 }

 // Computes the high (leading) surrogate of the UTF-16 pair that represents
 // code point (cp). Only meaningful for supplementary characters; for any
 // other input the result carries no meaning.
 inline uint16_t ToHighSurrogate(uint32_t cp) {
   // Constant offset that maps (cp >> 10) of the first supplementary code
   // point onto kMinHighSurrogate.
   const auto bias = JsonEscaping::kMinHighSurrogate -
                     (JsonEscaping::kMinSupplementaryCodePoint >> 10);
   return (cp >> 10) + bias;
 }

 // Decodes the next unicode code point from UTF-8 input that may arrive in
 // pieces: one code point occupies 1 to 4 bytes and can straddle consecutive
 // reads of the ByteSource.
 //
 // Parameters:
 //   str      - the bytes currently available.
 //   index    - position in (str) at which to start reading. The caller must
 //              make sure it is in range; in other words, at least one byte
 //              can be read successfully.
 //   cp       - in/out. If (*num_left) > 0 on entry, holds the partial code
 //              point left over from the previous call. On return holds the
 //              code point decoded so far (partial or complete).
 //   num_left - in/out. Number of bytes still missing to complete the code
 //              point. 0 on entry starts a new code point; 0 on return means
 //              (*cp) is complete.
 //   num_read - out. Number of bytes this call consumed from (str).
 //
 // Returns false if the input is not valid UTF-8 (bad lead byte, bad
 // continuation byte, surrogate value, or value above kMaxCodePoint). Returns
 // true otherwise, including the case where (str) ends before the code point
 // is complete.
 bool ReadCodePoint(StringPiece str, int index, uint32_t* cp,
                    int* num_left, int* num_read) {
   if (*num_left != 0) {
     // Resuming a code point that was cut off by the end of the previous
     // chunk: nothing has been consumed from this chunk yet.
     *num_read = 0;
   } else {
     // Starting a fresh code point. The lead byte determines its length:
     //
     //   lead byte    meaning
     //   0x00..0x7f   the code point itself (1 byte)
     //   0x80..0xbf   <invalid: continuation byte in lead position>
     //   0xc0..0xdf   2 bytes, lead carries bits 10-6
     //   0xe0..0xef   3 bytes, lead carries bits 15-12
     //   0xf0..0xf7   4 bytes, lead carries bits 20-18
     //   0xf8..0xff   <invalid: reserved>
     //
     // Every continuation byte carries 6 further payload bits.
     const uint32_t lead = static_cast<uint8_t>(str[index++]);
     *cp = lead;
     *num_read = 1;
     if (lead < 0x80) return true;
     if (lead < 0xc0 || lead > 0xf7) return false;
     if (lead < 0xe0) {
       *cp = lead & 0x1f;
       *num_left = 1;
     } else if (lead < 0xf0) {
       *cp = lead & 0x0f;
       *num_left = 2;
     } else {
       *cp = lead & 0x07;
       *num_left = 3;
     }
   }

   // Fold in as many continuation bytes as this chunk can supply.
   while (*num_left > 0 && index < str.size()) {
     const uint32_t ch = static_cast<uint8_t>(str[index++]);
     --*num_left;
     ++*num_read;
     *cp = (*cp << 6) | (ch & 0x3f);
     // The byte has already been counted in (*num_read) at this point, so a
     // malformed continuation byte is reported as consumed.
     if ((ch & 0xc0) != 0x80) return false;
   }

   // A code point that is still incomplete cannot be judged yet.
   if (*num_left > 0) return true;
   return !IsSurrogate(*cp) && IsValidCodePoint(*cp);
 }

 // Writes the four lowercase hexadecimal digits of the 16-bit value (cp) into
 // buffer[2..5] and returns a StringPiece over buffer[0..5].
 //
 // buffer[0] and buffer[1] are not written here; they must already hold the
 // "\u" prefix for the returned 6 bytes to form a complete escape. The buffer
 // needs to be at least 6 bytes long.
 StringPiece ToHex(uint16_t cp, char* buffer) {
   // Emit the least significant nibble first, filling right to left.
   for (int pos = 5; pos >= 2; --pos) {
     buffer[pos] = kHex[cp & 0x0f];
     cp >>= 4;
   }
   return StringPiece(buffer, 6);
 }

 // Writes code point (cp) as a UTF-16 surrogate pair in hexadecimal: the high
 // surrogate's digits go to buffer[2..5] and the low surrogate's digits to
 // buffer[8..11]. Returns a StringPiece over buffer[0..11].
 //
 // buffer[0..1] and buffer[6..7] are not written here; they must already hold
 // the two "\u" prefixes. The buffer needs to be at least 12 bytes long.
 StringPiece ToSurrogateHex(uint32_t cp, char* buffer) {
   uint16_t high = ToHighSurrogate(cp);
   uint16_t low = ToLowSurrogate(cp);

   // Fill both digit groups right to left, one nibble per iteration.
   for (int k = 0; k < 4; ++k) {
     buffer[5 - k] = kHex[high & 0x0f];
     buffer[11 - k] = kHex[low & 0x0f];
     high >>= 4;
     low >>= 4;
   }

   return StringPiece(buffer, 12);
 }

 // Returns the escaped form of code point (cp), or an empty StringPiece when
 // (cp) does not need escaping.
 //
 // The result points either into the static kCommonEscapes table or into
 // (buffer), which must be at least 12 bytes long.
 StringPiece EscapeCodePoint(uint32_t cp, char* buffer) {
   // Everything below 0xa0 is answered straight from the lookup table.
   if (cp < 0xa0) return kCommonEscapes[cp];

   // The remaining escapes are not required by the json spec; they are used
   // to prevent security bugs in javascript.

   // Individual code points that fit in a single \uXXXX escape.
   const bool single_unit_char =
       cp == 0x00ad ||  // Soft-hyphen
       cp == 0x06dd ||  // Arabic end of ayah
       cp == 0x070f ||  // Syriac abbreviation mark
       cp == 0x17b4 ||  // Khmer vowel inherent Aq
       cp == 0x17b5 ||  // Khmer vowel inherent Aa
       cp == 0xfeff ||  // Zero width no-break space
       cp == 0xfff9 ||  // Interlinear annotation anchor
       cp == 0xfffa ||  // Interlinear annotation separator
       cp == 0xfffb;    // Interlinear annotation terminator

   // Ranges that fit in a single \uXXXX escape.
   const bool single_unit_range =
       (0x0600 <= cp && cp <= 0x0603) ||  // Arabic signs
       (0x200b <= cp && cp <= 0x200f) ||  // Zero width etc.
       (0x2028 <= cp && cp <= 0x202e) ||  // Separators etc.
       (0x2060 <= cp && cp <= 0x2064) ||  // Invisible etc.
       (0x206a <= cp && cp <= 0x206f);    // Shaping etc.

   if (single_unit_char || single_unit_range) return ToHex(cp, buffer);

   // Supplementary characters, which need a surrogate pair.
   const bool needs_surrogate_pair =
       cp == 0x000e0001 ||                        // Language tag
       (0x0001d173 <= cp && cp <= 0x0001d17a) ||  // Music formatting
       (0x000e0020 <= cp && cp <= 0x000e007f);    // TAG symbols

   if (needs_surrogate_pair) return ToSurrogateHex(cp, buffer);

   return StringPiece();
 }

 // Escapes code point (cp) if it needs escaping. Otherwise, when
 // (force_output) is true, encodes (cp) as a multi-byte UTF-8 sequence into
 // (buffer) and returns that; when (force_output) is false, returns an empty
 // StringPiece.
 //
 // The UTF-8 bytes are stored right-aligned so that the last byte is always
 // buffer[5]; the returned StringPiece starts at buffer[4], buffer[3] or
 // buffer[2] for 2, 3 or 4 byte sequences respectively.
 StringPiece EscapeCodePoint(uint32_t cp, char* buffer,
                                   bool force_output) {
   const StringPiece escaped = EscapeCodePoint(cp, buffer);
   if (!force_output || !escaped.empty()) return escaped;

   // Sequence length by magnitude. This path never emits a single byte:
   // values that would fit in 7 bits still get the 2-byte form.
   const int length = cp <= 0x7ff ? 2 : (cp <= 0xffff ? 3 : 4);

   // Continuation bytes, written back to front, 6 payload bits each.
   char* out = buffer + 5;
   for (int k = 1; k < length; ++k) {
     *out-- = (cp & 0x3f) | 0x80;
     cp >>= 6;
   }

   // Lead byte: length marker combined with the remaining high bits.
   switch (length) {
     case 2:
       *out = cp | 0xc0;
       break;
     case 3:
       *out = cp | 0xe0;
       break;
     default:
       *out = (cp & 0x07) | 0xf0;
       break;
   }
   return StringPiece(out, length);
 }

 }  // namespace

 // Streams UTF-8 text from (input) to (output), replacing every code point
 // for which EscapeCodePoint() returns an escape by that escape sequence.
 //
 // Each pass of the outer loop works on one contiguous chunk obtained from
 // input->Peek(). A code point may be split by a chunk boundary, so the
 // partially decoded value and the number of bytes still missing are carried
 // from one pass to the next.
 //
 // Bytes that do not form valid UTF-8 are skipped and not written to
 // (output); no error is reported yet.
 void JsonEscaping::Escape(strings::ByteSource* input,
                           strings::ByteSink* output) {
   // Scratch space for escape sequences. The helpers only fill in the digits
   // after the two "\u" markers and never overwrite the markers themselves.
   char scratch[12] = "\\udead\\ubee";
   uint32_t code_point = 0;  // Code point being decoded.
   int pending = 0;          // Bytes still missing to complete it.

   while (input->Available() > 0) {
     const StringPiece chunk = input->Peek();
     // True when this pass begins in the middle of a code point. Its leading
     // bytes were already skipped, so it has to be re-encoded from
     // (code_point) rather than copied from the input.
     const bool resuming = pending > 0;
     StringPiece replacement;
     int clean_len = 0;  // Length of the prefix that can be copied verbatim.
     int consumed = 0;   // Bytes taken by the last code point read.
     bool valid = true;

     // Scan forward until one of:
     //   i) a code point that needs to be escaped;
     //  ii) a split code point has been completed;
     // iii) a byte sequence that is not valid UTF-8;
     //  iv) the end of (chunk).
     do {
       valid = ReadCodePoint(chunk, clean_len, &code_point, &pending,
                             &consumed);
       if (!valid || pending > 0) break;  // case iii or iv
       replacement = EscapeCodePoint(code_point, scratch, resuming);
       if (!replacement.empty()) break;  // case i or ii
       clean_len += consumed;
       consumed = 0;
     } while (clean_len < chunk.length());  // case iv

     // Pass the untouched prefix through, then drop the bytes of the code
     // point that stopped the scan.
     if (clean_len > 0) input->CopyTo(output, clean_len);
     if (consumed > 0) input->Skip(consumed);

     if (!valid) {
       // Case iii: Report error.
       // TODO(wpoon): Add error reporting.
       pending = 0;
       continue;
     }
     if (pending == 0 && !replacement.empty()) {
       // Case i or ii: emit the escaped (or re-encoded) code point.
       output->Append(replacement.data(), replacement.size());
     }
   }

   if (pending > 0) {
     // The input ended in the middle of a code point.
     // Treat as case iii: report error.
     // TODO(wpoon): Add error reporting.
   }
 }

 // Escapes the UTF-8 string (input) and appends the result to (output).
 //
 // Fast path: if every byte is in 0x20..0x7e and is none of '"', '<', '>' or
 // '\\', nothing needs escaping and (input) is appended verbatim. Any other
 // byte (control character, DEL, or a byte of a multi-byte UTF-8 sequence)
 // sends the whole string through the ByteSource-based overload.
 void JsonEscaping::Escape(StringPiece input, strings::ByteSink* output) {
   const size_t len = input.length();
   const char* p = input.data();

   bool can_skip_escaping = true;
   // The index has the same type as (len). A signed int index would be
   // compared against an unsigned length and would overflow on inputs of
   // 2 GiB or more.
   for (size_t i = 0; i < len; ++i) {
     // Examine the byte as unsigned so that the range test gives the same
     // answer whether plain char is signed or unsigned on the target.
     const unsigned char c = static_cast<unsigned char>(p[i]);
     if (c < 0x20 || c >= 0x7f || c == '"' || c == '<' || c == '>' ||
         c == '\\') {
       can_skip_escaping = false;
       break;
     }
   }

   if (can_skip_escaping) {
     output->Append(input.data(), input.length());
   } else {
     strings::ArrayByteSource source(input);
     Escape(&source, output);
   }
 }

 }  // namespace converter
 }  // namespace util
 }  // namespace protobuf
 }  // namespace google
	// Protocol Buffers - Google's data interchange format
	// Copyright 2008 Google Inc. All rights reserved.
	// https://developers.google.com/protocol-buffers/
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are
	// met:
	//
	// * Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above
	// copyright notice, this list of conditions and the following disclaimer
	// in the documentation and/or other materials provided with the
	// distribution.
	// * Neither the name of Google Inc. nor the names of its
	// contributors may be used to endorse or promote products derived from
	// this software without specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	#include <google/protobuf/util/internal/json_escaping.h>

	#include <cstdint>

	#include <google/protobuf/stubs/logging.h>
	#include <google/protobuf/stubs/common.h>

	namespace google {
	namespace protobuf {
	namespace util {
	namespace converter {

	namespace {

	// Lowercase hexadecimal digits, indexed by nibble value (0..15).
	static const char kHex[] =
	    "01234567"
	    "89abcdef";

	// Characters 0x00 to 0x9f are very commonly used, so we provide a special
	// table lookup.
	//
	// For unicode code point ch < 0xa0:
	// kCommonEscapes[ch] is the escaped string of ch, if escaping is needed;
	// or an empty string, if escaping is not needed.
	//
	// The longest entry is 6 characters ("\uXXXX"); the 7th byte of each row
	// holds the terminating NUL. The trailing comment on each source line gives
	// the range of indices that the line covers.
	static const char kCommonEscapes[160][7] = {
	// C0 (ASCII and derivatives) control characters
	"\\u0000", "\\u0001", "\\u0002", "\\u0003", // 0x00..0x03
	"\\u0004", "\\u0005", "\\u0006", "\\u0007", "\\b", "\\t", "\\n", "\\u000b", // 0x04..0x0b
	"\\f", "\\r", "\\u000e", "\\u000f", "\\u0010", "\\u0011", "\\u0012", // 0x0c..0x12
	"\\u0013", // 0x13
	"\\u0014", "\\u0015", "\\u0016", "\\u0017", "\\u0018", "\\u0019", "\\u001a", // 0x14..0x1a
	"\\u001b", "\\u001c", "\\u001d", "\\u001e", "\\u001f", // 0x1b..0x1f
	// Escaping of " and \ are required by www.json.org string definition.
	// Escaping of < and > are required for HTML security.
	"", "", "\\\"", "", "", "", "", "", // 0x20..0x27 ('"' is 0x22)
	"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", // 0x28..0x37
	"", "", "", "", "\\u003c", "", "\\u003e", "", "", "", "", "", "", "", "", // 0x38..0x46 ('<' is 0x3c, '>' is 0x3e)
	"", // 0x47
	"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", // 0x48..0x57
	"", "", "", "", "\\\\", "", "", "", "", "", "", "", "", "", "", "", // 0x58..0x67 ('\' is 0x5c)
	"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", // 0x68..0x77
	"", "", "", "", "", "", "", "\\u007f", // 0x78..0x7f (DEL is escaped)
	// C1 (ISO 8859 and Unicode) extended control characters
	"\\u0080", "\\u0081", "\\u0082", "\\u0083", // 0x80..0x83
	"\\u0084", "\\u0085", "\\u0086", "\\u0087", "\\u0088", "\\u0089", "\\u008a", // 0x84..0x8a
	"\\u008b", "\\u008c", "\\u008d", "\\u008e", "\\u008f", "\\u0090", "\\u0091", // 0x8b..0x91
	"\\u0092", "\\u0093", // 0x92..0x93
	"\\u0094", "\\u0095", "\\u0096", "\\u0097", "\\u0098", "\\u0099", "\\u009a", // 0x94..0x9a
	"\\u009b", "\\u009c", "\\u009d", "\\u009e", "\\u009f"}; // 0x9b..0x9f

	// Tells whether (c) is a unicode surrogate code unit (a high surrogate or a
	// low surrogate).
	inline bool IsSurrogate(uint32_t c) {
	  // Equivalent to c >= kMinHighSurrogate && c <= kMaxLowSurrogate, but done
	  // with one mask and one compare (2 ALU instructions instead of 3).
	  const uint32_t without_low_bits = c & 0xfffff800;
	  return without_low_bits == JsonEscaping::kMinHighSurrogate;
	}

	// Tells whether (cp) is a valid unicode code point, i.e. lies in the range
	// 0 <= cp <= kMaxCodePoint.
	inline bool IsValidCodePoint(uint32_t cp) {
	  if (cp > JsonEscaping::kMaxCodePoint) return false;
	  return true;
	}

	// Yields the low surrogate for code point (cp). The value is meaningless
	// unless (cp) is a supplementary character.
	inline uint16_t ToLowSurrogate(uint32_t cp) {
	  // Span of the low-surrogate range; doubles as the mask selecting the bits
	  // of (cp) that are stored in the low surrogate.
	  const auto low_span =
	      JsonEscaping::kMaxLowSurrogate - JsonEscaping::kMinLowSurrogate;
	  const auto low_bits = cp & low_span;
	  return low_bits + JsonEscaping::kMinLowSurrogate;
	}

	// Yields the high surrogate for code point (cp). The value is meaningless
	// unless (cp) is a supplementary character.
	inline uint16_t ToHighSurrogate(uint32_t cp) {
	  // Offset that lines up (cp >> 10) of the first supplementary code point
	  // with kMinHighSurrogate.
	  const auto offset = JsonEscaping::kMinHighSurrogate -
	                      (JsonEscaping::kMinSupplementaryCodePoint >> 10);
	  const auto upper_bits = cp >> 10;
	  return upper_bits + offset;
	}

	// Input str is encoded in UTF-8. A unicode code point could be encoded in
	// UTF-8 using anywhere from 1 to 4 bytes, and it could span multiple reads
	// of the ByteSource.
	//
	// This function reads the next unicode code point from the input (str) at
	// the given position (index), taking into account any left-over partial
	// code point from the previous call (cp), together with the number of
	// bytes left to read to complete this code point (num_left).
	//
	// This function assumes that the input (str) is valid at the given position
	// (index). In other words, at least one byte can be read successfully.
	//
	// The code point read (partial or complete) is stored in (cp). Upon return,
	// (num_left) stores the number of bytes that have yet to be read in order
	// to complete the current code point; it is 0 if the read is complete.
	// (num_read) is the number of bytes consumed by this call.
	//
	// Returns false if we encounter an invalid UTF-8 string. Returns true
	// otherwise, including the case when we reach the end of the input (str)
	// before a complete unicode code point is read.
	bool ReadCodePoint(StringPiece str, int index, uint32_t* cp,
	                   int* num_left, int* num_read) {
	  if (*num_left == 0) {
	    // Last read was complete. Start reading a new unicode code point.
	    *cp = static_cast<uint8_t>(str[index++]);
	    *num_read = 1;
	    // The length of the code point is determined from its first byte:
	    //
	    //    0..0x7f: that's the value of the code point.
	    // 0x80..0xbf: <invalid>
	    // 0xc0..0xdf: 11-bit code point encoded in 2 bytes.
	    //                                   bit 10-6, bit 5-0
	    // 0xe0..0xef: 16-bit code point encoded in 3 bytes.
	    //                        bit 15-12, bit 11-6, bit 5-0
	    // 0xf0..0xf7: 21-bit code point encoded in 4 bytes.
	    //             bit 20-18, bit 17-12, bit 11-6, bit 5-0
	    // 0xf8..0xff: <invalid>
	    if (*cp <= 0x7f) {
	      return true;
	    } else if (*cp <= 0xbf) {
	      return false;
	    } else if (*cp <= 0xdf) {
	      *cp &= 0x1f;
	      *num_left = 1;
	    } else if (*cp <= 0xef) {
	      *cp &= 0x0f;
	      *num_left = 2;
	    } else if (*cp <= 0xf7) {
	      *cp &= 0x07;
	      *num_left = 3;
	    } else {
	      return false;
	    }
	  } else {
	    // Last read was partial. Initialize num_read to 0 and continue reading
	    // the last unicode code point.
	    *num_read = 0;
	  }
	  while (*num_left > 0 && index < str.size()) {
	    uint32_t ch = static_cast<uint8_t>(str[index++]);
	    --(*num_left);
	    ++(*num_read);
	    // Append the 6 payload bits of the continuation byte. (cp) is an
	    // in/out pointer, so it must be dereferenced on both sides.
	    *cp = (*cp << 6) | (ch & 0x3f);
	    // A continuation byte must lie in 0x80..0xbf.
	    if (ch < 0x80 || ch > 0xbf) return false;
	  }
	  // An incomplete code point cannot be validated yet; a complete one must
	  // be neither a surrogate nor beyond the unicode range.
	  return *num_left > 0 || (!IsSurrogate(*cp) && IsValidCodePoint(*cp));
	}

	// Stores the four lowercase hexadecimal digits of the 16-bit value (cp) in
	// buffer[2..5] and returns a StringPiece covering buffer[0..5].
	//
	// buffer[0..1] are left untouched and must already contain the "\u" prefix.
	// The buffer needs to be at least 6 bytes long.
	StringPiece ToHex(uint16_t cp, char* buffer) {
	  // Walk backwards from the last digit, one nibble at a time.
	  char* digit = buffer + 5;
	  for (int n = 0; n < 4; ++n) {
	    *digit = kHex[cp & 0x0f];
	    cp >>= 4;
	    --digit;
	  }
	  return StringPiece(buffer, 6);
	}

	// Stores code point (cp) as a UTF-16 surrogate pair in hexadecimal: digits
	// of the high surrogate in buffer[2..5], digits of the low surrogate in
	// buffer[8..11]. Returns a StringPiece covering buffer[0..11].
	//
	// buffer[0..1] and buffer[6..7] are left untouched and must already contain
	// the two "\u" prefixes. The buffer needs to be at least 12 bytes long.
	StringPiece ToSurrogateHex(uint32_t cp, char* buffer) {
	  uint16_t low = ToLowSurrogate(cp);
	  uint16_t high = ToHighSurrogate(cp);

	  // Low surrogate, least significant nibble first.
	  for (int pos = 11; pos >= 8; --pos) {
	    buffer[pos] = kHex[low & 0x0f];
	    low >>= 4;
	  }

	  // High surrogate, least significant nibble first.
	  for (int pos = 5; pos >= 2; --pos) {
	    buffer[pos] = kHex[high & 0x0f];
	    high >>= 4;
	  }

	  return StringPiece(buffer, 12);
	}

	// If the given unicode code point needs escaping, then returns the
	// escaped form. The returned StringPiece either points to statically
	// pre-allocated char[] or to the given buffer. The input buffer needs
	// to be at least 12 bytes long.
	//
	// If the given unicode code point does not need escaping, an empty
	// StringPiece is returned.
	StringPiece EscapeCodePoint(uint32_t cp, char* buffer) {
	  // Code points below 0xa0 are answered from the lookup table.
	  if (cp < 0xa0) return kCommonEscapes[cp];
	  switch (cp) {
	    // These are not required by json spec
	    // but used to prevent security bugs in javascript.
	    case 0xfeff:  // Zero width no-break space
	    case 0xfff9:  // Interlinear annotation anchor
	    case 0xfffa:  // Interlinear annotation separator
	    case 0xfffb:  // Interlinear annotation terminator

	    case 0x00ad:  // Soft-hyphen
	    case 0x06dd:  // Arabic end of ayah
	    case 0x070f:  // Syriac abbreviation mark
	    case 0x17b4:  // Khmer vowel inherent Aq
	    case 0x17b5:  // Khmer vowel inherent Aa
	      return ToHex(cp, buffer);

	    default:
	      // Ranges that fit in a single \uXXXX escape.
	      if ((cp >= 0x0600 && cp <= 0x0603) ||  // Arabic signs
	          (cp >= 0x200b && cp <= 0x200f) ||  // Zero width etc.
	          (cp >= 0x2028 && cp <= 0x202e) ||  // Separators etc.
	          (cp >= 0x2060 && cp <= 0x2064) ||  // Invisible etc.
	          (cp >= 0x206a && cp <= 0x206f)) {  // Shaping etc.
	        return ToHex(cp, buffer);
	      }

	      // Supplementary characters, escaped as a surrogate pair.
	      if (cp == 0x000e0001 ||                        // Language tag
	          (cp >= 0x0001d173 && cp <= 0x0001d17a) ||  // Music formatting
	          (cp >= 0x000e0020 && cp <= 0x000e007f)) {  // TAG symbols
	        return ToSurrogateHex(cp, buffer);
	      }
	  }
	  return StringPiece();
	}

	// Tries to escape the given code point first. If the given code point
	// does not need to be escaped, but force_output is true, then renders
	// the given multi-byte code point in UTF-8 in the buffer and returns it.
	//
	// The UTF-8 bytes are stored right-aligned, ending at buffer[5]; the
	// returned StringPiece starts at buffer[4], buffer[3] or buffer[2] for a
	// 2, 3 or 4 byte sequence respectively.
	StringPiece EscapeCodePoint(uint32_t cp, char* buffer,
	                            bool force_output) {
	  StringPiece sp = EscapeCodePoint(cp, buffer);
	  if (force_output && sp.empty()) {
	    // Last continuation byte: low 6 bits of the code point.
	    buffer[5] = (cp & 0x3f) | 0x80;
	    cp >>= 6;
	    if (cp <= 0x1f) {
	      // The remaining bits fit the 5-bit payload of a 2-byte lead.
	      buffer[4] = cp | 0xc0;
	      sp = StringPiece(buffer + 4, 2);
	      return sp;
	    }
	    buffer[4] = (cp & 0x3f) | 0x80;
	    cp >>= 6;
	    if (cp <= 0x0f) {
	      // The remaining bits fit the 4-bit payload of a 3-byte lead.
	      buffer[3] = cp | 0xe0;
	      sp = StringPiece(buffer + 3, 3);
	      return sp;
	    }
	    // 4-byte sequence: one more continuation byte, then the 3-bit lead.
	    buffer[3] = (cp & 0x3f) | 0x80;
	    buffer[2] = ((cp >> 6) & 0x07) | 0xf0;
	    sp = StringPiece(buffer + 2, 4);
	  }
	  return sp;
	}

	} // namespace

	// Reads UTF-8 text from (input) chunk by chunk and writes it to (output),
	// replacing each code point for which EscapeCodePoint() returns an escape
	// by that escape sequence. The state of a code point that is split across
	// two chunks is carried in (cp) and (num_left).
	//
	// Bytes that do not form valid UTF-8 are skipped and not written to
	// (output); no error is reported yet.
	void JsonEscaping::Escape(strings::ByteSource* input,
	                          strings::ByteSink* output) {
	  // Scratch buffer for escapes. The helpers only fill in the digits after
	  // the two "\u" markers and never overwrite the markers themselves.
	  char buffer[12] = "\\udead\\ubee";
	  uint32_t cp = 0;   // Current unicode code point.
	  int num_left = 0;  // Num of bytes to read to complete the code point.
	  while (input->Available() > 0) {
	    StringPiece str = input->Peek();
	    StringPiece escaped;
	    int i = 0;
	    int num_read;
	    bool ok;
	    // A code point whose first bytes were consumed in an earlier pass has to
	    // be re-encoded from (cp), because those bytes are no longer in (str).
	    bool cp_was_split = num_left > 0;
	    // Loop until we encounter either
	    //   i) a code point that needs to be escaped; or
	    //  ii) a split code point is completely read; or
	    // iii) a character that is not a valid utf8; or
	    //  iv) end of the StringPiece str is reached.
	    do {
	      ok = ReadCodePoint(str, i, &cp, &num_left, &num_read);
	      if (num_left > 0 || !ok) break;  // case iii or iv
	      escaped = EscapeCodePoint(cp, buffer, cp_was_split);
	      if (!escaped.empty()) break;  // case i or ii
	      i += num_read;
	      num_read = 0;
	    } while (i < str.length());  // case iv
	    // First copy the un-escaped prefix, if any, to the output ByteSink.
	    if (i > 0) input->CopyTo(output, i);
	    // Then drop the bytes of the code point that ended the scan.
	    if (num_read > 0) input->Skip(num_read);
	    if (!ok) {
	      // Case iii: Report error.
	      // TODO(wpoon): Add error reporting.
	      num_left = 0;
	    } else if (num_left == 0 && !escaped.empty()) {
	      // Case i or ii: Append the escaped code point to the output ByteSink.
	      output->Append(escaped.data(), escaped.size());
	    }
	  }
	  if (num_left > 0) {
	    // Treat as case iii: report error.
	    // TODO(wpoon): Add error reporting.
	  }
	}

	// Escapes the UTF-8 string (input) and appends the result to (output).
	//
	// Fast path: if every byte is in 0x20..0x7e and is none of '"', '<', '>' or
	// '\\', nothing needs escaping and (input) is appended verbatim. Otherwise
	// the whole string goes through the ByteSource-based overload.
	void JsonEscaping::Escape(StringPiece input, strings::ByteSink* output) {
	  const size_t len = input.length();
	  const char* p = input.data();

	  bool can_skip_escaping = true;
	  // size_t index to match (len); an int index would mix signed and unsigned
	  // in the comparison and overflow on inputs of 2 GiB or more.
	  for (size_t i = 0; i < len; ++i) {
	    // Look at the byte as unsigned so the range test does not depend on the
	    // signedness of plain char.
	    const unsigned char c = static_cast<unsigned char>(p[i]);
	    if (c < 0x20 || c >= 0x7f || c == '"' || c == '<' || c == '>' ||
	        c == '\\') {
	      can_skip_escaping = false;
	      break;
	    }
	  }

	  if (can_skip_escaping) {
	    output->Append(input.data(), input.length());
	  } else {
	    strings::ArrayByteSource source(input);
	    Escape(&source, output);
	  }
	}

	} // namespace converter
	} // namespace util
	} // namespace protobuf
	} // namespace google