Source/wtf/text/TextCodecUTF8.cpp - platform/external/chromium_org/third_party/WebKit - Git at Google

 /*
  * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "config.h"
 #include "wtf/text/TextCodecUTF8.h"

 #include "wtf/text/TextCodecASCIIFastPath.h"
 #include "wtf/text/CString.h"
 #include "wtf/text/StringBuffer.h"
 #include "wtf/unicode/CharacterNames.h"

 using namespace WTF;
 using namespace WTF::Unicode;
 using namespace std;

 namespace WTF {

 const int nonCharacter = -1;

 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
 {
     return adoptPtr(new TextCodecUTF8);
 }

 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
 {
     registrar("UTF-8", "UTF-8");

     // Additional aliases that originally were present in the encoding
     // table in WebKit on Macintosh, and subsequently added by
     // TextCodecICU. Perhaps we can prove some are not used on the web
     // and remove them.
     registrar("unicode11utf8", "UTF-8");
     registrar("unicode20utf8", "UTF-8");
     registrar("utf8", "UTF-8");
     registrar("x-unicode20utf8", "UTF-8");
 }

 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
 {
     registrar("UTF-8", create, 0);
 }

 static inline int nonASCIISequenceLength(uint8_t firstByte)
 {
     static const uint8_t lengths[256] = {
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
     };
     return lengths[firstByte];
 }

 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
 {
     ASSERT(!isASCII(sequence[0]));
     if (length == 2) {
         ASSERT(sequence[0] <= 0xDF);
         if (sequence[0] < 0xC2)
             return nonCharacter;
         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
             return nonCharacter;
         return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
     }
     if (length == 3) {
         ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
         switch (sequence[0]) {
         case 0xE0:
             if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
                 return nonCharacter;
             break;
         case 0xED:
             if (sequence[1] < 0x80 || sequence[1] > 0x9F)
                 return nonCharacter;
             break;
         default:
             if (sequence[1] < 0x80 || sequence[1] > 0xBF)
                 return nonCharacter;
         }
         if (sequence[2] < 0x80 || sequence[2] > 0xBF)
             return nonCharacter;
         return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
     }
     ASSERT(length == 4);
     ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
     switch (sequence[0]) {
     case 0xF0:
         if (sequence[1] < 0x90 || sequence[1] > 0xBF)
             return nonCharacter;
         break;
     case 0xF4:
         if (sequence[1] < 0x80 || sequence[1] > 0x8F)
             return nonCharacter;
         break;
     default:
         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
             return nonCharacter;
     }
     if (sequence[2] < 0x80 || sequence[2] > 0xBF)
         return nonCharacter;
     if (sequence[3] < 0x80 || sequence[3] > 0xBF)
         return nonCharacter;
     return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
 }

 static inline UChar* appendCharacter(UChar* destination, int character)
 {
     ASSERT(character != nonCharacter);
     ASSERT(!U_IS_SURROGATE(character));
     if (U_IS_BMP(character))
         *destination++ = character;
     else {
         *destination++ = U16_LEAD(character);
         *destination++ = U16_TRAIL(character);
     }
     return destination;
 }

 void TextCodecUTF8::consumePartialSequenceByte()
 {
     --m_partialSequenceSize;
     memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
 }

 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
 {
     sawError = true;
     if (stopOnError)
         return;
     // Each error generates a replacement character and consumes one byte.
     *destination++ = replacementCharacter;
     consumePartialSequenceByte();
 }

 template <>
 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
 {
     ASSERT(m_partialSequenceSize);
     do {
         if (isASCII(m_partialSequence[0])) {
             *destination++ = m_partialSequence[0];
             consumePartialSequenceByte();
             continue;
         }
         int count = nonASCIISequenceLength(m_partialSequence[0]);
         if (!count)
             return true;

         if (count > m_partialSequenceSize) {
             if (count - m_partialSequenceSize > end - source) {
                 if (!flush) {
                     // The new data is not enough to complete the sequence, so
                     // add it to the existing partial sequence.
                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
                     m_partialSequenceSize += end - source;
                     return false;
                 }
                 // An incomplete partial sequence at the end is an error, but it will create
                 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
                 // the error.
                 return true;
             }
             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
             source += count - m_partialSequenceSize;
             m_partialSequenceSize = count;
         }
         int character = decodeNonASCIISequence(m_partialSequence, count);
         if ((character == nonCharacter) || (character > 0xff))
             return true;

         m_partialSequenceSize -= count;
         *destination++ = character;
     } while (m_partialSequenceSize);

     return false;
 }

 template <>
 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
 {
     ASSERT(m_partialSequenceSize);
     do {
         if (isASCII(m_partialSequence[0])) {
             *destination++ = m_partialSequence[0];
             consumePartialSequenceByte();
             continue;
         }
         int count = nonASCIISequenceLength(m_partialSequence[0]);
         if (!count) {
             handleError(destination, stopOnError, sawError);
             if (stopOnError)
                 return false;
             continue;
         }
         if (count > m_partialSequenceSize) {
             if (count - m_partialSequenceSize > end - source) {
                 if (!flush) {
                     // The new data is not enough to complete the sequence, so
                     // add it to the existing partial sequence.
                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
                     m_partialSequenceSize += end - source;
                     return false;
                 }
                 // An incomplete partial sequence at the end is an error.
                 handleError(destination, stopOnError, sawError);
                 if (stopOnError)
                     return false;
                 continue;
             }
             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
             source += count - m_partialSequenceSize;
             m_partialSequenceSize = count;
         }
         int character = decodeNonASCIISequence(m_partialSequence, count);
         if (character == nonCharacter) {
             handleError(destination, stopOnError, sawError);
             if (stopOnError)
                 return false;
             continue;
         }

         m_partialSequenceSize -= count;
         destination = appendCharacter(destination, character);
     } while (m_partialSequenceSize);

     return false;
 }

 String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
 {
     // Each input byte might turn into a character.
     // That includes all bytes in the partial-sequence buffer because
     // each byte in an invalid sequence will turn into a replacement character.
     StringBuffer<LChar> buffer(m_partialSequenceSize + length);

     const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
     const uint8_t* end = source + length;
     const uint8_t* alignedEnd = alignToMachineWord(end);
     LChar* destination = buffer.characters();

     do {
         if (m_partialSequenceSize) {
             // Explicitly copy destination and source pointers to avoid taking pointers to the
             // local variables, which may harm code generation by disabling some optimizations
             // in some compilers.
             LChar* destinationForHandlePartialSequence = destination;
             const uint8_t* sourceForHandlePartialSequence = source;
             if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
                 source = sourceForHandlePartialSequence;
                 goto upConvertTo16Bit;
             }
             destination = destinationForHandlePartialSequence;
             source = sourceForHandlePartialSequence;
             if (m_partialSequenceSize)
                 break;
         }

         while (source < end) {
             if (isASCII(*source)) {
                 // Fast path for ASCII. Most UTF-8 text will be ASCII.
                 if (isAlignedToMachineWord(source)) {
                     while (source < alignedEnd) {
                         MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                         if (!isAllASCII<LChar>(chunk))
                             break;
                         copyASCIIMachineWord(destination, source);
                         source += sizeof(MachineWord);
                         destination += sizeof(MachineWord);
                     }
                     if (source == end)
                         break;
                     if (!isASCII(*source))
                         continue;
                 }
                 *destination++ = *source++;
                 continue;
             }
             int count = nonASCIISequenceLength(*source);
             int character;
             if (!count)
                 character = nonCharacter;
             else {
                 if (count > end - source) {
                     ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                     ASSERT(!m_partialSequenceSize);
                     m_partialSequenceSize = end - source;
                     memcpy(m_partialSequence, source, m_partialSequenceSize);
                     source = end;
                     break;
                 }
                 character = decodeNonASCIISequence(source, count);
             }
             if (character == nonCharacter) {
                 sawError = true;
                 if (stopOnError)
                     break;

                 goto upConvertTo16Bit;
             }
             if (character > 0xff)
                 goto upConvertTo16Bit;

             source += count;
             *destination++ = character;
         }
     } while (flush && m_partialSequenceSize);

     buffer.shrink(destination - buffer.characters());

     return String::adopt(buffer);

 upConvertTo16Bit:
     StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

     UChar* destination16 = buffer16.characters();

     // Copy the already converted characters
     for (LChar* converted8 = buffer.characters(); converted8 < destination;)
         *destination16++ = *converted8++;

     do {
         if (m_partialSequenceSize) {
             // Explicitly copy destination and source pointers to avoid taking pointers to the
             // local variables, which may harm code generation by disabling some optimizations
             // in some compilers.
             UChar* destinationForHandlePartialSequence = destination16;
             const uint8_t* sourceForHandlePartialSequence = source;
             handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
             destination16 = destinationForHandlePartialSequence;
             source = sourceForHandlePartialSequence;
             if (m_partialSequenceSize)
                 break;
         }

         while (source < end) {
             if (isASCII(*source)) {
                 // Fast path for ASCII. Most UTF-8 text will be ASCII.
                 if (isAlignedToMachineWord(source)) {
                     while (source < alignedEnd) {
                         MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                         if (!isAllASCII<LChar>(chunk))
                             break;
                         copyASCIIMachineWord(destination16, source);
                         source += sizeof(MachineWord);
                         destination16 += sizeof(MachineWord);
                     }
                     if (source == end)
                         break;
                     if (!isASCII(*source))
                         continue;
                 }
                 *destination16++ = *source++;
                 continue;
             }
             int count = nonASCIISequenceLength(*source);
             int character;
             if (!count)
                 character = nonCharacter;
             else {
                 if (count > end - source) {
                     ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                     ASSERT(!m_partialSequenceSize);
                     m_partialSequenceSize = end - source;
                     memcpy(m_partialSequence, source, m_partialSequenceSize);
                     source = end;
                     break;
                 }
                 character = decodeNonASCIISequence(source, count);
             }
             if (character == nonCharacter) {
                 sawError = true;
                 if (stopOnError)
                     break;
                 // Each error generates a replacement character and consumes one byte.
                 *destination16++ = replacementCharacter;
                 ++source;
                 continue;
             }
             source += count;
             destination16 = appendCharacter(destination16, character);
         }
     } while (flush && m_partialSequenceSize);

     buffer16.shrink(destination16 - buffer16.characters());

     return String::adopt(buffer16);
 }

 template<typename CharType>
 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
 {
     // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
     // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
     // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
     if (length > numeric_limits<size_t>::max() / 3)
         CRASH();
     Vector<uint8_t> bytes(length * 3);

     size_t i = 0;
     size_t bytesWritten = 0;
     while (i < length) {
         UChar32 character;
         U16_NEXT(characters, i, length, character);
         // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
         // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
         if (0xD800 <= character && character <= 0xDFFF)
             character = replacementCharacter;
         U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
     }

     return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
 }

 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
 {
     return encodeCommon(characters, length);
 }

 CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)
 {
     return encodeCommon(characters, length);
 }

 } // namespace WTF
	/*
	* Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#include "config.h"
	#include "wtf/text/TextCodecUTF8.h"

	#include "wtf/text/TextCodecASCIIFastPath.h"
	#include "wtf/text/CString.h"
	#include "wtf/text/StringBuffer.h"
	#include "wtf/unicode/CharacterNames.h"

	using namespace WTF;
	using namespace WTF::Unicode;
	using namespace std;

	namespace WTF {

	const int nonCharacter = -1;

	PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
	{
	return adoptPtr(new TextCodecUTF8);
	}

	void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
	{
	registrar("UTF-8", "UTF-8");

	// Additional aliases that originally were present in the encoding
	// table in WebKit on Macintosh, and subsequently added by
	// TextCodecICU. Perhaps we can prove some are not used on the web
	// and remove them.
	registrar("unicode11utf8", "UTF-8");
	registrar("unicode20utf8", "UTF-8");
	registrar("utf8", "UTF-8");
	registrar("x-unicode20utf8", "UTF-8");
	}

	void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
	{
	registrar("UTF-8", create, 0);
	}

	static inline int nonASCIISequenceLength(uint8_t firstByte)
	{
	static const uint8_t lengths[256] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	};
	return lengths[firstByte];
	}

	static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
	{
	ASSERT(!isASCII(sequence[0]));
	if (length == 2) {
	ASSERT(sequence[0] <= 0xDF);
	if (sequence[0] < 0xC2)
	return nonCharacter;
	if (sequence[1] < 0x80 \|\| sequence[1] > 0xBF)
	return nonCharacter;
	return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
	}
	if (length == 3) {
	ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
	switch (sequence[0]) {
	case 0xE0:
	if (sequence[1] < 0xA0 \|\| sequence[1] > 0xBF)
	return nonCharacter;
	break;
	case 0xED:
	if (sequence[1] < 0x80 \|\| sequence[1] > 0x9F)
	return nonCharacter;
	break;
	default:
	if (sequence[1] < 0x80 \|\| sequence[1] > 0xBF)
	return nonCharacter;
	}
	if (sequence[2] < 0x80 \|\| sequence[2] > 0xBF)
	return nonCharacter;
	return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
	}
	ASSERT(length == 4);
	ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
	switch (sequence[0]) {
	case 0xF0:
	if (sequence[1] < 0x90 \|\| sequence[1] > 0xBF)
	return nonCharacter;
	break;
	case 0xF4:
	if (sequence[1] < 0x80 \|\| sequence[1] > 0x8F)
	return nonCharacter;
	break;
	default:
	if (sequence[1] < 0x80 \|\| sequence[1] > 0xBF)
	return nonCharacter;
	}
	if (sequence[2] < 0x80 \|\| sequence[2] > 0xBF)
	return nonCharacter;
	if (sequence[3] < 0x80 \|\| sequence[3] > 0xBF)
	return nonCharacter;
	return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
	}

	static inline UChar* appendCharacter(UChar* destination, int character)
	{
	ASSERT(character != nonCharacter);
	ASSERT(!U_IS_SURROGATE(character));
	if (U_IS_BMP(character))
	*destination++ = character;
	else {
	*destination++ = U16_LEAD(character);
	*destination++ = U16_TRAIL(character);
	}
	return destination;
	}

	void TextCodecUTF8::consumePartialSequenceByte()
	{
	--m_partialSequenceSize;
	memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
	}

	void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
	{
	sawError = true;
	if (stopOnError)
	return;
	// Each error generates a replacement character and consumes one byte.
	*destination++ = replacementCharacter;
	consumePartialSequenceByte();
	}

	template <>
	bool TextCodecUTF8::handlePartialSequence<LChar>(LChar& destination, const uint8_t& source, const uint8_t* end, bool flush, bool, bool&)
	{
	ASSERT(m_partialSequenceSize);
	do {
	if (isASCII(m_partialSequence[0])) {
	*destination++ = m_partialSequence[0];
	consumePartialSequenceByte();
	continue;
	}
	int count = nonASCIISequenceLength(m_partialSequence[0]);
	if (!count)
	return true;

	if (count > m_partialSequenceSize) {
	if (count - m_partialSequenceSize > end - source) {
	if (!flush) {
	// The new data is not enough to complete the sequence, so
	// add it to the existing partial sequence.
	memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
	m_partialSequenceSize += end - source;
	return false;
	}
	// An incomplete partial sequence at the end is an error, but it will create
	// a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
	// the error.
	return true;
	}
	memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
	source += count - m_partialSequenceSize;
	m_partialSequenceSize = count;
	}
	int character = decodeNonASCIISequence(m_partialSequence, count);
	if ((character == nonCharacter) \|\| (character > 0xff))
	return true;

	m_partialSequenceSize -= count;
	*destination++ = character;
	} while (m_partialSequenceSize);

	return false;
	}

	template <>
	bool TextCodecUTF8::handlePartialSequence<UChar>(UChar& destination, const uint8_t& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
	{
	ASSERT(m_partialSequenceSize);
	do {
	if (isASCII(m_partialSequence[0])) {
	*destination++ = m_partialSequence[0];
	consumePartialSequenceByte();
	continue;
	}
	int count = nonASCIISequenceLength(m_partialSequence[0]);
	if (!count) {
	handleError(destination, stopOnError, sawError);
	if (stopOnError)
	return false;
	continue;
	}
	if (count > m_partialSequenceSize) {
	if (count - m_partialSequenceSize > end - source) {
	if (!flush) {
	// The new data is not enough to complete the sequence, so
	// add it to the existing partial sequence.
	memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
	m_partialSequenceSize += end - source;
	return false;
	}
	// An incomplete partial sequence at the end is an error.
	handleError(destination, stopOnError, sawError);
	if (stopOnError)
	return false;
	continue;
	}
	memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
	source += count - m_partialSequenceSize;
	m_partialSequenceSize = count;
	}
	int character = decodeNonASCIISequence(m_partialSequence, count);
	if (character == nonCharacter) {
	handleError(destination, stopOnError, sawError);
	if (stopOnError)
	return false;
	continue;
	}

	m_partialSequenceSize -= count;
	destination = appendCharacter(destination, character);
	} while (m_partialSequenceSize);

	return false;
	}

	String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
	{
	// Each input byte might turn into a character.
	// That includes all bytes in the partial-sequence buffer because
	// each byte in an invalid sequence will turn into a replacement character.
	StringBuffer<LChar> buffer(m_partialSequenceSize + length);

	const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
	const uint8_t* end = source + length;
	const uint8_t* alignedEnd = alignToMachineWord(end);
	LChar* destination = buffer.characters();

	do {
	if (m_partialSequenceSize) {
	// Explicitly copy destination and source pointers to avoid taking pointers to the
	// local variables, which may harm code generation by disabling some optimizations
	// in some compilers.
	LChar* destinationForHandlePartialSequence = destination;
	const uint8_t* sourceForHandlePartialSequence = source;
	if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
	source = sourceForHandlePartialSequence;
	goto upConvertTo16Bit;
	}
	destination = destinationForHandlePartialSequence;
	source = sourceForHandlePartialSequence;
	if (m_partialSequenceSize)
	break;
	}

	while (source < end) {
	if (isASCII(*source)) {
	// Fast path for ASCII. Most UTF-8 text will be ASCII.
	if (isAlignedToMachineWord(source)) {
	while (source < alignedEnd) {
	MachineWord chunk = reinterpret_cast_ptr<const MachineWord>(source);
	if (!isAllASCII<LChar>(chunk))
	break;
	copyASCIIMachineWord(destination, source);
	source += sizeof(MachineWord);
	destination += sizeof(MachineWord);
	}
	if (source == end)
	break;
	if (!isASCII(*source))
	continue;
	}
	destination++ = source++;
	continue;
	}
	int count = nonASCIISequenceLength(*source);
	int character;
	if (!count)
	character = nonCharacter;
	else {
	if (count > end - source) {
	ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
	ASSERT(!m_partialSequenceSize);
	m_partialSequenceSize = end - source;
	memcpy(m_partialSequence, source, m_partialSequenceSize);
	source = end;
	break;
	}
	character = decodeNonASCIISequence(source, count);
	}
	if (character == nonCharacter) {
	sawError = true;
	if (stopOnError)
	break;

	goto upConvertTo16Bit;
	}
	if (character > 0xff)
	goto upConvertTo16Bit;

	source += count;
	*destination++ = character;
	}
	} while (flush && m_partialSequenceSize);

	buffer.shrink(destination - buffer.characters());

	return String::adopt(buffer);

	upConvertTo16Bit:
	StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

	UChar* destination16 = buffer16.characters();

	// Copy the already converted characters
	for (LChar* converted8 = buffer.characters(); converted8 < destination;)
	destination16++ = converted8++;

	do {
	if (m_partialSequenceSize) {
	// Explicitly copy destination and source pointers to avoid taking pointers to the
	// local variables, which may harm code generation by disabling some optimizations
	// in some compilers.
	UChar* destinationForHandlePartialSequence = destination16;
	const uint8_t* sourceForHandlePartialSequence = source;
	handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
	destination16 = destinationForHandlePartialSequence;
	source = sourceForHandlePartialSequence;
	if (m_partialSequenceSize)
	break;
	}

	while (source < end) {
	if (isASCII(*source)) {
	// Fast path for ASCII. Most UTF-8 text will be ASCII.
	if (isAlignedToMachineWord(source)) {
	while (source < alignedEnd) {
	MachineWord chunk = reinterpret_cast_ptr<const MachineWord>(source);
	if (!isAllASCII<LChar>(chunk))
	break;
	copyASCIIMachineWord(destination16, source);
	source += sizeof(MachineWord);
	destination16 += sizeof(MachineWord);
	}
	if (source == end)
	break;
	if (!isASCII(*source))
	continue;
	}
	destination16++ = source++;
	continue;
	}
	int count = nonASCIISequenceLength(*source);
	int character;
	if (!count)
	character = nonCharacter;
	else {
	if (count > end - source) {
	ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
	ASSERT(!m_partialSequenceSize);
	m_partialSequenceSize = end - source;
	memcpy(m_partialSequence, source, m_partialSequenceSize);
	source = end;
	break;
	}
	character = decodeNonASCIISequence(source, count);
	}
	if (character == nonCharacter) {
	sawError = true;
	if (stopOnError)
	break;
	// Each error generates a replacement character and consumes one byte.
	*destination16++ = replacementCharacter;
	++source;
	continue;
	}
	source += count;
	destination16 = appendCharacter(destination16, character);
	}
	} while (flush && m_partialSequenceSize);

	buffer16.shrink(destination16 - buffer16.characters());

	return String::adopt(buffer16);
	}

	template<typename CharType>
	CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
	{
	// The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
	// BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
	// Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
	if (length > numeric_limits<size_t>::max() / 3)
	CRASH();
	Vector<uint8_t> bytes(length * 3);

	size_t i = 0;
	size_t bytesWritten = 0;
	while (i < length) {
	UChar32 character;
	U16_NEXT(characters, i, length, character);
	// U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
	// is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
	if (0xD800 <= character && character <= 0xDFFF)
	character = replacementCharacter;
	U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
	}

	return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
	}

	CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
	{
	return encodeCommon(characters, length);
	}

	CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)
	{
	return encodeCommon(characters, length);
	}

	} // namespace WTF