// utf/unicodetext.cc - platform/external/chromium_org/third_party/libphonenumber/src/phonenumbers - Git at Google

 // Copyright (C) 2006 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Author: Jim Meehan

 #include <algorithm>
 #include <sstream>
 #include <cassert>
 #include <cstdio>
 #include <cstring>

 #include "phonenumbers/utf/unicodetext.h"
 #include "phonenumbers/utf/stringpiece.h"
 //#include "utf/stringprintf.h"
 #include "phonenumbers/utf/utf.h"
 #include "phonenumbers/utf/unilib.h"

 namespace i18n {
 namespace phonenumbers {

 using std::stringstream;
 using std::max;
 using std::hex;
 using std::dec;

 // Returns the number of code points encoded in the byte range [start, end).
 // Every byte that is not a UTF-8 trail byte starts a code point, so counting
 // those bytes counts the characters.
 static int CodepointDistance(const char* start, const char* end) {
   int count = 0;
   const char* cursor = start;
   while (cursor < end) {
     // Read as signed, trail bytes 0x80..0xBF are -128..-65; every other byte
     // is >= -0x40.
     const signed char value = *reinterpret_cast<const signed char*>(cursor);
     if (value >= -0x40) {
       ++count;
     }
     ++cursor;
   }
   return count;
 }

 // Counts the code points in the first |len| bytes of |utf8|.
 static int CodepointCount(const char* utf8, int len) {
   const char* const limit = utf8 + len;
   return CodepointDistance(utf8, limit);
 }

 // Number of code points between two iterators, found by scanning the UTF-8
 // bytes that lie between their positions.
 UnicodeText::const_iterator::difference_type
 distance(const UnicodeText::const_iterator& first,
          const UnicodeText::const_iterator& last) {
   const char* const from = first.it_;
   const char* const to = last.it_;
   return CodepointDistance(from, to);
 }

 // ---------- Utility ----------

 // Rewrites the |len| bytes at |start| in place so that only
 // interchange-valid UTF-8 remains, and returns the number of bytes written.
 //
 // This routine is called only after a UTF-8 buffer passed to CopyUTF8,
 // TakeOwnershipOfUTF8, or PointToUTF8 turned out not to be interchange
 // valid, which indicates a bug in the caller.
 //
 // Each structurally invalid byte is replaced by one space. Each character
 // that is structurally valid but not interchange valid is also replaced by
 // exactly one space, however many bytes it occupied. E.g., "\xEF\xB7\x90"
 // (U+FDD0) yields one space, not three.
 //
 // The conversion never writes more data than it reads, so changing the
 // buffer in place is safe.
 static int ConvertToInterchangeValid(char* start, int len) {
   char* const base = start;
   char* const limit = start + len;
   char* dest = start;
   char* src = start;
   while (src < limit) {
     // Keep the longest interchange-valid run, sliding it down when earlier
     // replacements have shortened the text.
     const int valid_bytes = UniLib::SpanInterchangeValid(src, limit - src);
     if (valid_bytes > 0) {
       if (dest != src) {
         memmove(dest, src, valid_bytes);
       }
       dest += valid_bytes;
       src += valid_bytes;
       if (src == limit) {
         break;
       }
     }
     // src is now at an offending sequence: either malformed UTF-8 or a
     // well-formed character that is not interchange valid.
     char32 rune;
     int rune_bytes;
     if (isvalidcharntorune(src, limit - src, &rune, &rune_bytes)) {
       src += rune_bytes;  // Well-formed: drop the whole character.
     } else {
       src += 1;  // Malformed: drop a single byte.
     }
     *dest++ = ' ';
   }
   return dest - base;
 }


 // *************** Data representation **********

 // Note: the copy constructor is undefined.

 // After reserve(), resize(), or clear(), we're an owner, not an alias.

 // Makes this Repr the owner of a buffer that can hold at least
 // |new_capacity| bytes. The stored bytes and size_ are preserved.
 void UnicodeText::Repr::reserve(int new_capacity) {
   // If there's already enough capacity, and we're an owner, do nothing.
   if (capacity_ >= new_capacity && ours_) return;

   // Otherwise, allocate a new buffer, growing by at least 1.5x + 20. The new
   // capacity stays in a local until the allocation has succeeded, so a
   // throwing new[] cannot leave capacity_ larger than the real buffer.
   const int grown_capacity = max(new_capacity, (3 * capacity_) / 2 + 20);
   char* new_data = new char[grown_capacity];

   // If there is an old buffer, copy it into the new buffer.
   if (data_) {
     if (size_ > 0) memcpy(new_data, data_, size_);
     if (ours_) delete[] data_;  // If we owned the old buffer, free it.
   }
   data_ = new_data;
   capacity_ = grown_capacity;
   ours_ = true;  // We own the new buffer.
   // size_ is unchanged.
 }

 // Sets the stored size to |new_size| bytes. A size of zero clears the Repr.
 // Otherwise the buffer becomes owned, is grown if needed, and any bytes
 // gained by growing are zero-filled.
 void UnicodeText::Repr::resize(int new_size) {
   if (new_size == 0) {
     clear();
     return;
   }
   const bool must_reallocate = !ours_ || new_size > capacity_;
   if (must_reallocate) {
     reserve(new_size);
   }
   if (size_ < new_size) {
     // Zero the newly exposed tail.
     memset(data_ + size_, 0, new_size - size_);
   }
   size_ = new_size;
   ours_ = true;
 }

 // Empties the Repr. An owned buffer is freed (not strictly necessary;
 // setting size_ to 0 would also work), while an aliased buffer is just
 // forgotten. Afterwards the Repr is an owner with no buffer.
 void UnicodeText::Repr::clear() {
   if (ours_) {
     delete[] data_;
   }
   ours_ = true;
   capacity_ = 0;
   size_ = 0;
   data_ = NULL;
 }

 // Replaces the contents with a private copy of the |size| bytes at |data|.
 void UnicodeText::Repr::Copy(const char* data, int size) {
   resize(size);
   // resize(0) leaves data_ null, and passing a null pointer to the copy
   // routine is undefined even for zero bytes, so skip it. memmove is used
   // because |data| may overlap the buffer we already hold.
   if (size > 0) memmove(data_, data, size);
 }

 // Adopts the buffer |data| (|size| bytes used, |capacity| bytes allocated).
 // The buffer is later released with delete[].
 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
   // Free the previous buffer only if we own it and it is a different one.
   // When |data| is the buffer we already hold (owned or merely aliased),
   // keep it and fall through so the new size, capacity and ownership are
   // still recorded instead of being silently ignored.
   if (ours_ && data_ && data != data_) delete[] data_;
   data_ = data;
   size_ = size;
   capacity_ = capacity;
   ours_ = true;
 }

 // Turns this Repr into a non-owning alias of the |size| bytes at |data|.
 // A buffer owned so far is freed first, so |data| must not point into it.
 void UnicodeText::Repr::PointTo(const char* data, int size) {
   const bool owns_buffer = ours_ && data_;
   if (owns_buffer) {
     delete[] data_;  // Release what we owned before aliasing.
   }
   ours_ = false;
   capacity_ = size;
   size_ = size;
   data_ = const_cast<char*>(data);
 }

 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
   reserve(size_ + byte_length);
   memcpy(data_ + size_, bytes, byte_length);
   size_ += byte_length;
 }

 // Returns a one-line description of this Repr: its address, the stored
 // bytes, size, capacity, and whether the buffer is owned or aliased.
 string UnicodeText::Repr::DebugString() const {
   stringstream ss;

   ss << "{Repr " << static_cast<const void*>(this) << " data=";
   // data_ is not NUL-terminated, so write exactly size_ bytes rather than
   // streaming it as a C string.
   if (data_ != NULL && size_ > 0) ss.write(data_, size_);
   ss << " size=" << size_ << " capacity=" << capacity_ << " "
      << (ours_ ? "Owned" : "Alias") << "}";

   // str() returns the whole text; extracting with >> would stop at the
   // first space.
   return ss.str();
 }


 // *************** UnicodeText ******************

 // ----- Constructors -----

 // Default constructor. Nothing to do beyond default-constructing the
 // members.
 UnicodeText::UnicodeText() {}

 // Copy constructor: duplicates the bytes of |src| via Copy().
 UnicodeText::UnicodeText(const UnicodeText& src) {
   this->Copy(src);
 }

 // Substring constructor: builds a text from the bytes in [first, last).
 UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
                          const UnicodeText::const_iterator& last) {
   assert(first <= last && "Incompatible iterators");
   const char* const from = first.it_;
   repr_.append(from, last.it_ - from);
 }

 // Returns the UTF-8 bytes in [first, last) as a string.
 string UnicodeText::UTF8Substring(const const_iterator& first,
                                   const const_iterator& last) {
   assert(first <= last && "Incompatible iterators");
   const char* const from = first.it_;
   return string(from, last.it_ - from);
 }


 // ----- Copy -----

 // Assignment: copies the bytes of |src|; self-assignment is a no-op.
 UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
   if (this == &src) {
     return *this;
   }
   Copy(src);
   return *this;
 }

 // Replaces the contents with a copy of the bytes held by |src|.
 UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
   const Repr& from = src.repr_;
   repr_.Copy(from.data_, from.size_);
   return *this;
 }

 // Copies |byte_length| bytes from |buffer|. If the bytes are not
 // interchange-valid UTF-8, a warning goes to stderr and the copy is
 // repaired in place by ConvertToInterchangeValid.
 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
   repr_.Copy(buffer, byte_length);
   if (UniLib::IsInterchangeValid(buffer, byte_length)) {
     return *this;
   }
   fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n");
   repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
   return *this;
 }

 // Copies |byte_length| bytes from |buffer| without any validity check.
 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
                                          int byte_length) {
   Repr& target = repr_;
   target.Copy(buffer, byte_length);
   return *this;
 }

 // ----- TakeOwnershipOf  -----

 // Adopts |buffer| (|byte_length| bytes used, |byte_capacity| allocated). If
 // the bytes are not interchange-valid UTF-8, a warning goes to stderr and
 // the adopted buffer is repaired in place.
 UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
                                               int byte_length,
                                               int byte_capacity) {
   repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
   if (UniLib::IsInterchangeValid(buffer, byte_length)) {
     return *this;
   }
   fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n");
   repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
   return *this;
 }

 // Adopts |buffer| without any validity check.
 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
                                                     int byte_length,
                                                     int byte_capacity) {
   Repr& target = repr_;
   target.TakeOwnershipOf(buffer, byte_length, byte_capacity);
   return *this;
 }

 // ----- PointTo -----

 // Aliases |buffer| when its |byte_length| bytes are interchange-valid
 // UTF-8. Otherwise a warning goes to stderr and the text holds a repaired
 // private copy instead of an alias.
 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
   if (UniLib::IsInterchangeValid(buffer, byte_length)) {
     repr_.PointTo(buffer, byte_length);
   } else {
     // Newline-terminated like the same warning in CopyUTF8 and
     // TakeOwnershipOfUTF8, so later stderr output starts on its own line.
     fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n");
     repr_.Copy(buffer, byte_length);
     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
   }
   return *this;
 }

 // Aliases |buffer| without any validity check.
 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
                                             int byte_length) {
   Repr& target = repr_;
   target.PointTo(buffer, byte_length);
   return *this;
 }

 // Makes this text a non-owning alias of the bytes held by |src|.
 UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
   // Aliasing ourselves would have Repr::PointTo free an owned buffer and
   // then point at it, so a self-alias is treated as a no-op.
   if (this != &src) {
     repr_.PointTo(src.repr_.data_, src.repr_.size_);
   }
   return *this;
 }

 // Makes this text a non-owning alias of the bytes in [first, last).
 UnicodeText& UnicodeText::PointTo(const const_iterator &first,
                                   const const_iterator &last) {
   assert(first <= last && " Incompatible iterators");
   const char* const from = first.utf8_data();
   repr_.PointTo(from, last.utf8_data() - from);
   return *this;
 }

 // ----- Append -----

 // Appends all bytes of |u| to this text.
 UnicodeText& UnicodeText::append(const UnicodeText& u) {
   const Repr& from = u.repr_;
   repr_.append(from.data_, from.size_);
   return *this;
 }

 // Appends the bytes in [first, last) to this text.
 UnicodeText& UnicodeText::append(const const_iterator& first,
                                  const const_iterator& last) {
   assert(first <= last && "Incompatible iterators");
   const char* const from = first.it_;
   repr_.append(from, last.it_ - from);
   return *this;
 }

 // Appends |len| bytes from |utf8| without any validity check.
 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
   Repr& target = repr_;
   target.append(utf8, len);
   return *this;
 }

 // ----- substring searching -----

 // Searches for |look| starting at |start_pos|, which must lie inside this
 // text (checked by assertion). Returns end() when there is no match.
 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
                                               const_iterator start_pos) const {
   const char* const from = start_pos.utf8_data();
   assert(from >= utf8_data());
   assert(from <= utf8_data() + utf8_length());
   return UnsafeFind(look, start_pos);
 }

 // Searches for |look| from the beginning of this text.
 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
   const const_iterator from = begin();
   return UnsafeFind(look, from);
 }

 // Byte-wise search for |look| at or after |start_pos|, with no range check
 // on |start_pos|. Returns end() when there is no match.
 UnicodeText::const_iterator UnicodeText::UnsafeFind(
     const UnicodeText& look, const_iterator start_pos) const {
   // With UTF-8, looking for a sequence of characters is the same as a plain
   // substring search over the bytes.
   StringPiece haystack(utf8_data(), utf8_length());
   StringPiece needle(look.utf8_data(), look.utf8_length());
   StringPiece::size_type offset =
       haystack.find(needle, start_pos.utf8_data() - utf8_data());
   if (offset != StringPiece::npos) {
     return const_iterator(utf8_data() + offset);
   }
   return end();
 }

 bool UnicodeText::HasReplacementChar() const {
   // Equivalent to:
   //   UnicodeText replacement_char;
   //   replacement_char.push_back(0xFFFD);
   //   return find(replacement_char) != end();
   StringPiece searching(utf8_data(), utf8_length());
   StringPiece looking_for("\xEF\xBF\xBD", 3);
   return searching.find(looking_for) != StringPiece::npos;
 }

 // ----- other methods -----

 // Clear operator
 void UnicodeText::clear() {
   repr_.clear();
 }

 // Destructor. No work is done here directly.
 UnicodeText::~UnicodeText() {
 }


 // Appends code point |c| encoded as UTF-8. An illegal code point, or one
 // that is not valid for interchange, is reported on stderr and a single
 // space is appended in its place.
 void UnicodeText::push_back(char32 c) {
   if (!UniLib::IsValidCodepoint(c)) {
     fprintf(stderr, "Illegal Unicode value: 0x%x\n", c);
     repr_.append(" ", 1);
     return;
   }

   char buf[UTFmax];
   int len = runetochar(buf, &c);
   if (!UniLib::IsInterchangeValid(buf, len)) {
     fprintf(stderr, "Unicode value 0x%x is not valid for interchange\n", c);
     repr_.append(" ", 1);
     return;
   }

   repr_.append(buf, len);
 }

 int UnicodeText::size() const {
   return CodepointCount(repr_.data_, repr_.size_);
 }

 // Two texts are equal when they hold the same sequence of bytes.
 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
   if (&lhs == &rhs) return true;
   const int byte_count = lhs.repr_.size_;
   if (byte_count != rhs.repr_.size_) return false;
   // Two empty texts are equal. Return before memcmp, because an empty text
   // may hold a null data_ and memcmp must not be given null pointers.
   if (byte_count == 0) return true;
   return memcmp(lhs.repr_.data_, rhs.repr_.data_, byte_count) == 0;
 }

 // Returns a one-line description of this text: its address, the number of
 // code points, and the description of its representation. The layout
 // follows "{UnicodeText %p chars=%d repr=%s}".
 string UnicodeText::DebugString() const {
   stringstream ss;

   ss << "{UnicodeText " << static_cast<const void*>(this) << " chars="
      << size() << " repr=" << repr_.DebugString() << "}";

   // str() returns the whole text; extracting with >> would stop at the
   // first space.
   return ss.str();
 }


 // ******************* UnicodeText::const_iterator *********************

 // The implementation of const_iterator would be nicer if it
 // inherited from boost::iterator_facade
 // (http://boost.org/libs/iterator/doc/iterator_facade.html).

 // A default-constructed iterator holds a null position.
 UnicodeText::const_iterator::const_iterator()
     : it_(0) {
 }

 // Copy constructor: the new iterator refers to the same byte as |other|.
 UnicodeText::const_iterator::const_iterator(const const_iterator& other)
     : it_(other.it_) {}

 // Assignment: takes over the position of |other|.
 UnicodeText::const_iterator&
 UnicodeText::const_iterator::operator=(const const_iterator& other) {
   if (this == &other) {
     return *this;
   }
   it_ = other.it_;
   return *this;
 }

 // Iterator at the first stored byte.
 UnicodeText::const_iterator UnicodeText::begin() const {
   const char* const first_byte = repr_.data_;
   return const_iterator(first_byte);
 }

 // Iterator one past the last stored byte.
 UnicodeText::const_iterator UnicodeText::end() const {
   const char* const past_last = repr_.data_ + repr_.size_;
   return const_iterator(past_last);
 }

 // Orders iterators by the address of the byte they refer to.
 bool operator<(const UnicodeText::const_iterator& lhs,
                const UnicodeText::const_iterator& rhs) {
   const bool before = lhs.it_ < rhs.it_;
   return before;
 }

 // Decodes the code point at the current position.
 //
 // chartorune is not used because it does error checking, whereas the data
 // is guaranteed to be valid UTF-8 and this routine is expected to be called
 // very often. The lead byte selects the sequence length; later bytes are
 // read only when the sequence is that long.
 char32 UnicodeText::const_iterator::operator*() const {
   const uint8 lead = static_cast<uint8>(it_[0]);
   if (lead < 0x80) {
     return lead;  // One byte: 0xxxxxxx.
   }

   const uint8 second = static_cast<uint8>(it_[1]);
   if (lead < 0xE0) {
     // Two bytes: 110xxxxx 10xxxxxx.
     return ((lead & 0x1F) << 6) | (second & 0x3F);
   }

   const uint8 third = static_cast<uint8>(it_[2]);
   if (lead < 0xF0) {
     // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
     return ((lead & 0x0F) << 12) | ((second & 0x3F) << 6) | (third & 0x3F);
   }

   // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
   const uint8 fourth = static_cast<uint8>(it_[3]);
   return ((lead & 0x07) << 18) | ((second & 0x3F) << 12) |
          ((third & 0x3F) << 6) | (fourth & 0x3F);
 }

 // Advances by the byte length UniLib::OneCharLen reports for the current
 // position.
 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
   const int step = UniLib::OneCharLen(it_);
   it_ += step;
   return *this;
 }

 // Steps back to the previous byte that is not a trail byte.
 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
   do {
     --it_;
   } while (UniLib::IsTrailByte(*it_));
   return *this;
 }

 // Copies the UTF-8 bytes of the current character into |utf8_output| and
 // returns how many bytes were written (1 to 4, chosen by the lead byte).
 // No terminator is written.
 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
   const unsigned char lead = static_cast<unsigned char>(it_[0]);

   utf8_output[0] = it_[0];
   if (lead < 0x80) {
     return 1;
   }

   utf8_output[1] = it_[1];
   if (lead < 0xE0) {
     return 2;
   }

   utf8_output[2] = it_[2];
   if (lead < 0xF0) {
     return 3;
   }

   utf8_output[3] = it_[3];
   return 4;
 }


 // Wraps the raw pointer |p| in an iterator. Assertions check that |p| is
 // non-null, lies within [utf8_data(), utf8_data() + utf8_length()], and
 // does not point at a trail byte.
 UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
   assert(p != NULL);
   const char* const first_byte = utf8_data();
   const int byte_count = utf8_length();
   const char* const past_last = first_byte + byte_count;
   assert(p >= first_byte);
   assert(p <= past_last);
   assert(p == past_last || !UniLib::IsTrailByte(*p));
   return const_iterator(p);
 }

 // Returns "{iter <address>}" for the byte this iterator refers to.
 string UnicodeText::const_iterator::DebugString() const {
   stringstream ss;

   // Cast to void* so the address is printed; streaming a char pointer
   // would print the bytes it points to as a C string.
   ss << "{iter " << static_cast<const void*>(it_) << "}";

   // str() returns the whole text; extracting with >> would stop at the
   // first space.
   return ss.str();
 }

 }  // namespace phonenumbers
 }  // namespace i18n
	// Copyright (C) 2006 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	// Author: Jim Meehan

	#include <algorithm>
	#include <sstream>
	#include <cassert>
	#include <cstdio>

	#include "phonenumbers/utf/unicodetext.h"
	#include "phonenumbers/utf/stringpiece.h"
	//#include "utf/stringprintf.h"
	#include "phonenumbers/utf/utf.h"
	#include "phonenumbers/utf/unilib.h"

	namespace i18n {
	namespace phonenumbers {

	using std::stringstream;
	using std::max;
	using std::hex;
	using std::dec;

	// Returns the number of code points encoded in the byte range [start, end).
	// Every byte that is not a UTF-8 trail byte starts a code point, so counting
	// those bytes counts the characters.
	static int CodepointDistance(const char* start, const char* end) {
	  int n = 0;
	  // Increment n on every non-trail-byte. Read as signed, trail bytes
	  // 0x80..0xBF are -128..-65; every other byte is >= -0x40.
	  for (const char* p = start; p < end; ++p) {
	    // The cast targets a pointer type and is then dereferenced; casting
	    // the pointer itself to a signed char does not compile.
	    n += (*reinterpret_cast<const signed char*>(p) >= -0x40);
	  }
	  return n;
	}

	// Counts the code points in the first |len| bytes of |utf8|.
	static int CodepointCount(const char* utf8, int len) {
	  const char* const limit = utf8 + len;
	  return CodepointDistance(utf8, limit);
	}

	// Number of code points between two iterators, found by scanning the UTF-8
	// bytes that lie between their positions.
	UnicodeText::const_iterator::difference_type
	distance(const UnicodeText::const_iterator& first,
	         const UnicodeText::const_iterator& last) {
	  const char* const from = first.it_;
	  const char* const to = last.it_;
	  return CodepointDistance(from, to);
	}

	// ---------- Utility ----------

	// Rewrites the |len| bytes at |start| in place so that only
	// interchange-valid UTF-8 remains, and returns the number of bytes written.
	//
	// This routine is called only after a UTF-8 buffer passed to CopyUTF8,
	// TakeOwnershipOfUTF8, or PointToUTF8 turned out not to be interchange
	// valid, which indicates a bug in the caller.
	//
	// Each structurally invalid byte is replaced by one space. Each character
	// that is structurally valid but not interchange valid is also replaced by
	// exactly one space, however many bytes it occupied. E.g., "\xEF\xB7\x90"
	// (U+FDD0) yields one space, not three.
	//
	// The conversion never writes more data than it reads, so changing the
	// buffer in place is safe.
	static int ConvertToInterchangeValid(char* start, int len) {
	  char* const base = start;
	  char* const limit = start + len;
	  char* dest = start;
	  char* src = start;
	  while (src < limit) {
	    // Keep the longest interchange-valid run, sliding it down when earlier
	    // replacements have shortened the text.
	    const int valid_bytes = UniLib::SpanInterchangeValid(src, limit - src);
	    if (valid_bytes > 0) {
	      if (dest != src) {
	        memmove(dest, src, valid_bytes);
	      }
	      dest += valid_bytes;
	      src += valid_bytes;
	      if (src == limit) {
	        break;
	      }
	    }
	    // src is now at an offending sequence: either malformed UTF-8 or a
	    // well-formed character that is not interchange valid.
	    char32 rune;
	    int rune_bytes;
	    if (isvalidcharntorune(src, limit - src, &rune, &rune_bytes)) {
	      src += rune_bytes;  // Well-formed: drop the whole character.
	    } else {
	      src += 1;  // Malformed: drop a single byte.
	    }
	    *dest++ = ' ';
	  }
	  return dest - base;
	}


	// ************* Data representation ********

	// Note: the copy constructor is undefined.

	// After reserve(), resize(), or clear(), we're an owner, not an alias.

	// Makes this Repr the owner of a buffer that can hold at least
	// |new_capacity| bytes. The stored bytes and size_ are preserved.
	void UnicodeText::Repr::reserve(int new_capacity) {
	  // If there's already enough capacity, and we're an owner, do nothing.
	  if (capacity_ >= new_capacity && ours_) return;

	  // Otherwise, allocate a new buffer, growing by at least 1.5x + 20. The new
	  // capacity stays in a local until the allocation has succeeded, so a
	  // throwing new[] cannot leave capacity_ larger than the real buffer.
	  const int grown_capacity = max(new_capacity, (3 * capacity_) / 2 + 20);
	  char* new_data = new char[grown_capacity];

	  // If there is an old buffer, copy it into the new buffer.
	  if (data_) {
	    if (size_ > 0) memcpy(new_data, data_, size_);
	    if (ours_) delete[] data_;  // If we owned the old buffer, free it.
	  }
	  data_ = new_data;
	  capacity_ = grown_capacity;
	  ours_ = true;  // We own the new buffer.
	  // size_ is unchanged.
	}

	// Sets the stored size to |new_size| bytes. A size of zero clears the Repr.
	// Otherwise the buffer becomes owned, is grown if needed, and any bytes
	// gained by growing are zero-filled.
	void UnicodeText::Repr::resize(int new_size) {
	  if (new_size == 0) {
	    clear();
	  } else {
	    // Logical OR; the escaped "\|\|" text found here does not compile.
	    if (!ours_ || new_size > capacity_) reserve(new_size);
	    // Clear the memory in the expanded part.
	    if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
	    size_ = new_size;
	    ours_ = true;
	  }
	}

	// Empties the Repr. An owned buffer is freed (not strictly necessary;
	// setting size_ to 0 would also work), while an aliased buffer is just
	// forgotten. Afterwards the Repr is an owner with no buffer.
	void UnicodeText::Repr::clear() {
	  if (ours_) {
	    delete[] data_;
	  }
	  ours_ = true;
	  capacity_ = 0;
	  size_ = 0;
	  data_ = NULL;
	}

	// Replaces the contents with a private copy of the |size| bytes at |data|.
	void UnicodeText::Repr::Copy(const char* data, int size) {
	  resize(size);
	  // resize(0) leaves data_ null, and passing a null pointer to the copy
	  // routine is undefined even for zero bytes, so skip it. memmove is used
	  // because |data| may overlap the buffer we already hold.
	  if (size > 0) memmove(data_, data, size);
	}

	// Adopts the buffer |data| (|size| bytes used, |capacity| bytes allocated).
	// The buffer is later released with delete[].
	void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
	  // Free the previous buffer only if we own it and it is a different one.
	  // When |data| is the buffer we already hold (owned or merely aliased),
	  // keep it and fall through so the new size, capacity and ownership are
	  // still recorded instead of being silently ignored.
	  if (ours_ && data_ && data != data_) delete[] data_;
	  data_ = data;
	  size_ = size;
	  capacity_ = capacity;
	  ours_ = true;
	}

	// Turns this Repr into a non-owning alias of the |size| bytes at |data|.
	// A buffer owned so far is freed first, so |data| must not point into it.
	void UnicodeText::Repr::PointTo(const char* data, int size) {
	  const bool owns_buffer = ours_ && data_;
	  if (owns_buffer) {
	    delete[] data_;  // Release what we owned before aliasing.
	  }
	  ours_ = false;
	  capacity_ = size;
	  size_ = size;
	  data_ = const_cast<char*>(data);
	}

	void UnicodeText::Repr::append(const char* bytes, int byte_length) {
	reserve(size_ + byte_length);
	memcpy(data_ + size_, bytes, byte_length);
	size_ += byte_length;
	}

	// Returns a one-line description of this Repr: its address, the stored
	// bytes, size, capacity, and whether the buffer is owned or aliased.
	string UnicodeText::Repr::DebugString() const {
	  stringstream ss;

	  ss << "{Repr " << static_cast<const void*>(this) << " data=";
	  // data_ is not NUL-terminated, so write exactly size_ bytes rather than
	  // streaming it as a C string.
	  if (data_ != NULL && size_ > 0) ss.write(data_, size_);
	  ss << " size=" << size_ << " capacity=" << capacity_ << " "
	     << (ours_ ? "Owned" : "Alias") << "}";

	  // str() returns the whole text; extracting with >> would stop at the
	  // first space.
	  return ss.str();
	}



	// ************* UnicodeText ****************

	// ----- Constructors -----

	// Default constructor. Nothing to do beyond default-constructing the
	// members.
	UnicodeText::UnicodeText() {}

	// Copy constructor: duplicates the bytes of |src| via Copy().
	UnicodeText::UnicodeText(const UnicodeText& src) {
	  this->Copy(src);
	}

	// Substring constructor: builds a text from the bytes in [first, last).
	UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
	                         const UnicodeText::const_iterator& last) {
	  assert(first <= last && "Incompatible iterators");
	  const char* const from = first.it_;
	  repr_.append(from, last.it_ - from);
	}

	// Returns the UTF-8 bytes in [first, last) as a string.
	string UnicodeText::UTF8Substring(const const_iterator& first,
	                                  const const_iterator& last) {
	  assert(first <= last && "Incompatible iterators");
	  const char* const from = first.it_;
	  return string(from, last.it_ - from);
	}


	// ----- Copy -----

	// Assignment: copies the bytes of |src|; self-assignment is a no-op.
	UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
	  if (this == &src) {
	    return *this;
	  }
	  Copy(src);
	  return *this;
	}

	// Replaces the contents with a copy of the bytes held by |src|.
	UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
	  const Repr& from = src.repr_;
	  repr_.Copy(from.data_, from.size_);
	  return *this;
	}

	// Copies |byte_length| bytes from |buffer|. If the bytes are not
	// interchange-valid UTF-8, a warning goes to stderr and the copy is
	// repaired in place by ConvertToInterchangeValid.
	UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
	  repr_.Copy(buffer, byte_length);
	  if (UniLib::IsInterchangeValid(buffer, byte_length)) {
	    return *this;
	  }
	  fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n");
	  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
	  return *this;
	}

	// Copies |byte_length| bytes from |buffer| without any validity check.
	UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
	                                         int byte_length) {
	  Repr& target = repr_;
	  target.Copy(buffer, byte_length);
	  return *this;
	}

	// ----- TakeOwnershipOf -----

	// Adopts |buffer| (|byte_length| bytes used, |byte_capacity| allocated). If
	// the bytes are not interchange-valid UTF-8, a warning goes to stderr and
	// the adopted buffer is repaired in place.
	UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
	                                              int byte_length,
	                                              int byte_capacity) {
	  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
	  if (UniLib::IsInterchangeValid(buffer, byte_length)) {
	    return *this;
	  }
	  fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n");
	  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
	  return *this;
	}

	// Adopts |buffer| without any validity check.
	UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
	                                                    int byte_length,
	                                                    int byte_capacity) {
	  Repr& target = repr_;
	  target.TakeOwnershipOf(buffer, byte_length, byte_capacity);
	  return *this;
	}

	// ----- PointTo -----

	// Aliases |buffer| when its |byte_length| bytes are interchange-valid
	// UTF-8. Otherwise a warning goes to stderr and the text holds a repaired
	// private copy instead of an alias.
	UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
	  if (UniLib::IsInterchangeValid(buffer, byte_length)) {
	    repr_.PointTo(buffer, byte_length);
	  } else {
	    // Newline-terminated like the same warning in CopyUTF8 and
	    // TakeOwnershipOfUTF8, so later stderr output starts on its own line.
	    fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n");
	    repr_.Copy(buffer, byte_length);
	    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
	  }
	  return *this;
	}

	// Aliases |buffer| without any validity check.
	UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
	                                            int byte_length) {
	  Repr& target = repr_;
	  target.PointTo(buffer, byte_length);
	  return *this;
	}

	// Makes this text a non-owning alias of the bytes held by |src|.
	UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
	  // Aliasing ourselves would have Repr::PointTo free an owned buffer and
	  // then point at it, so a self-alias is treated as a no-op.
	  if (this != &src) {
	    repr_.PointTo(src.repr_.data_, src.repr_.size_);
	  }
	  return *this;
	}

	// Makes this text a non-owning alias of the bytes in [first, last).
	UnicodeText& UnicodeText::PointTo(const const_iterator &first,
	                                  const const_iterator &last) {
	  assert(first <= last && " Incompatible iterators");
	  const char* const from = first.utf8_data();
	  repr_.PointTo(from, last.utf8_data() - from);
	  return *this;
	}

	// ----- Append -----

	// Appends all bytes of |u| to this text.
	UnicodeText& UnicodeText::append(const UnicodeText& u) {
	  const Repr& from = u.repr_;
	  repr_.append(from.data_, from.size_);
	  return *this;
	}

	// Appends the bytes in [first, last) to this text.
	UnicodeText& UnicodeText::append(const const_iterator& first,
	                                 const const_iterator& last) {
	  assert(first <= last && "Incompatible iterators");
	  const char* const from = first.it_;
	  repr_.append(from, last.it_ - from);
	  return *this;
	}

	// Appends |len| bytes from |utf8| without any validity check.
	UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
	  Repr& target = repr_;
	  target.append(utf8, len);
	  return *this;
	}

	// ----- substring searching -----

	// Searches for |look| starting at |start_pos|, which must lie inside this
	// text (checked by assertion). Returns end() when there is no match.
	UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
	                                              const_iterator start_pos) const {
	  const char* const from = start_pos.utf8_data();
	  assert(from >= utf8_data());
	  assert(from <= utf8_data() + utf8_length());
	  return UnsafeFind(look, start_pos);
	}

	// Searches for |look| from the beginning of this text.
	UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
	  const const_iterator from = begin();
	  return UnsafeFind(look, from);
	}

	// Byte-wise search for |look| at or after |start_pos|, with no range check
	// on |start_pos|. Returns end() when there is no match.
	UnicodeText::const_iterator UnicodeText::UnsafeFind(
	    const UnicodeText& look, const_iterator start_pos) const {
	  // With UTF-8, looking for a sequence of characters is the same as a plain
	  // substring search over the bytes.
	  StringPiece haystack(utf8_data(), utf8_length());
	  StringPiece needle(look.utf8_data(), look.utf8_length());
	  StringPiece::size_type offset =
	      haystack.find(needle, start_pos.utf8_data() - utf8_data());
	  if (offset != StringPiece::npos) {
	    return const_iterator(utf8_data() + offset);
	  }
	  return end();
	}

	bool UnicodeText::HasReplacementChar() const {
	// Equivalent to:
	// UnicodeText replacement_char;
	// replacement_char.push_back(0xFFFD);
	// return find(replacement_char) != end();
	StringPiece searching(utf8_data(), utf8_length());
	StringPiece looking_for("\xEF\xBF\xBD", 3);
	return searching.find(looking_for) != StringPiece::npos;
	}

	// ----- other methods -----

	// Clear operator
	void UnicodeText::clear() {
	repr_.clear();
	}

	// Destructor. No work is done here directly.
	UnicodeText::~UnicodeText() {
	}


	// Appends code point |c| encoded as UTF-8. An illegal code point, or one
	// that is not valid for interchange, is reported on stderr and a single
	// space is appended in its place.
	void UnicodeText::push_back(char32 c) {
	  if (!UniLib::IsValidCodepoint(c)) {
	    fprintf(stderr, "Illegal Unicode value: 0x%x\n", c);
	    repr_.append(" ", 1);
	    return;
	  }

	  char buf[UTFmax];
	  int len = runetochar(buf, &c);
	  if (!UniLib::IsInterchangeValid(buf, len)) {
	    fprintf(stderr, "Unicode value 0x%x is not valid for interchange\n", c);
	    repr_.append(" ", 1);
	    return;
	  }

	  repr_.append(buf, len);
	}

	int UnicodeText::size() const {
	return CodepointCount(repr_.data_, repr_.size_);
	}

	// Two texts are equal when they hold the same sequence of bytes.
	bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
	  if (&lhs == &rhs) return true;
	  const int byte_count = lhs.repr_.size_;
	  if (byte_count != rhs.repr_.size_) return false;
	  // Two empty texts are equal. Return before memcmp, because an empty text
	  // may hold a null data_ and memcmp must not be given null pointers.
	  if (byte_count == 0) return true;
	  return memcmp(lhs.repr_.data_, rhs.repr_.data_, byte_count) == 0;
	}

	// Returns a one-line description of this text: its address, the number of
	// code points, and the description of its representation. The layout
	// follows "{UnicodeText %p chars=%d repr=%s}".
	string UnicodeText::DebugString() const {
	  stringstream ss;

	  ss << "{UnicodeText " << static_cast<const void*>(this) << " chars="
	     << size() << " repr=" << repr_.DebugString() << "}";

	  // str() returns the whole text; extracting with >> would stop at the
	  // first space.
	  return ss.str();
	}


	// ***************** UnicodeText::const_iterator *******************

	// The implementation of const_iterator would be nicer if it
	// inherited from boost::iterator_facade
	// (http://boost.org/libs/iterator/doc/iterator_facade.html).

	// A default-constructed iterator holds a null position.
	UnicodeText::const_iterator::const_iterator()
	    : it_(0) {
	}

	// Copy constructor: the new iterator refers to the same byte as |other|.
	UnicodeText::const_iterator::const_iterator(const const_iterator& other)
	    : it_(other.it_) {}

	// Assignment: takes over the position of |other|.
	UnicodeText::const_iterator&
	UnicodeText::const_iterator::operator=(const const_iterator& other) {
	  if (this == &other) {
	    return *this;
	  }
	  it_ = other.it_;
	  return *this;
	}

	// Iterator at the first stored byte.
	UnicodeText::const_iterator UnicodeText::begin() const {
	  const char* const first_byte = repr_.data_;
	  return const_iterator(first_byte);
	}

	// Iterator one past the last stored byte.
	UnicodeText::const_iterator UnicodeText::end() const {
	  const char* const past_last = repr_.data_ + repr_.size_;
	  return const_iterator(past_last);
	}

	// Orders iterators by the address of the byte they refer to.
	bool operator<(const UnicodeText::const_iterator& lhs,
	               const UnicodeText::const_iterator& rhs) {
	  const bool before = lhs.it_ < rhs.it_;
	  return before;
	}

	// Decodes the code point at the current position.
	//
	// chartorune is not used because it does error checking, whereas the data
	// is guaranteed to be valid UTF-8 and this routine is expected to be called
	// very often. The lead byte selects the sequence length; later bytes are
	// read only when the sequence is that long.
	//
	// The bit groups are combined with bitwise OR; the escaped "\|" text found
	// here does not compile.
	char32 UnicodeText::const_iterator::operator*() const {
	  // Convert from UTF-8
	  uint8 byte1 = static_cast<uint8>(it_[0]);
	  if (byte1 < 0x80)
	    return byte1;

	  uint8 byte2 = static_cast<uint8>(it_[1]);
	  if (byte1 < 0xE0)
	    return ((byte1 & 0x1F) << 6)
	          | (byte2 & 0x3F);

	  uint8 byte3 = static_cast<uint8>(it_[2]);
	  if (byte1 < 0xF0)
	    return ((byte1 & 0x0F) << 12)
	         | ((byte2 & 0x3F) << 6)
	         |  (byte3 & 0x3F);

	  uint8 byte4 = static_cast<uint8>(it_[3]);
	  return ((byte1 & 0x07) << 18)
	       | ((byte2 & 0x3F) << 12)
	       | ((byte3 & 0x3F) << 6)
	       |  (byte4 & 0x3F);
	}

	// Advances by the byte length UniLib::OneCharLen reports for the current
	// position.
	UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
	  const int step = UniLib::OneCharLen(it_);
	  it_ += step;
	  return *this;
	}

	// Steps back to the previous byte that is not a trail byte.
	UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
	  do {
	    --it_;
	  } while (UniLib::IsTrailByte(*it_));
	  return *this;
	}

	// Copies the UTF-8 bytes of the current character into |utf8_output| and
	// returns how many bytes were written (1 to 4, chosen by the lead byte).
	// No terminator is written.
	int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
	  const unsigned char lead = static_cast<unsigned char>(it_[0]);

	  utf8_output[0] = it_[0];
	  if (lead < 0x80) {
	    return 1;
	  }

	  utf8_output[1] = it_[1];
	  if (lead < 0xE0) {
	    return 2;
	  }

	  utf8_output[2] = it_[2];
	  if (lead < 0xF0) {
	    return 3;
	  }

	  utf8_output[3] = it_[3];
	  return 4;
	}


	// Wraps the raw pointer |p| in an iterator. Assertions check that |p| is
	// non-null, lies within [utf8_data(), utf8_data() + utf8_length()], and
	// does not point at a trail byte.
	UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
	  assert(p != NULL);
	  const char* start = utf8_data();
	  int len = utf8_length();
	  const char* end = start + len;
	  assert(p >= start);
	  assert(p <= end);
	  // Logical OR; the escaped "\|\|" text found here does not compile.
	  assert(p == end || !UniLib::IsTrailByte(*p));
	  return const_iterator(p);
	}

	// Returns "{iter <address>}" for the byte this iterator refers to.
	string UnicodeText::const_iterator::DebugString() const {
	  stringstream ss;

	  // Cast to void* so the address is printed; streaming a char pointer
	  // would print the bytes it points to as a C string.
	  ss << "{iter " << static_cast<const void*>(it_) << "}";

	  // str() returns the whole text; extracting with >> would stop at the
	  // first space.
	  return ss.str();
	}

	} // namespace phonenumbers
	} // namespace i18n