| // Copyright (C) 2006 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // Author: Jim Meehan |
| |
| #ifndef UTIL_UTF8_UNICODETEXT_H__ |
| #define UTIL_UTF8_UNICODETEXT_H__ |
| |
| #include <iterator> |
| #include <string> |
| #include <utility> |
| #include "phonenumbers/base/basictypes.h" |
| |
| namespace i18n { |
| namespace phonenumbers { |
| |
| using std::string; |
| using std::bidirectional_iterator_tag; |
| using std::pair; |
| |
| // ***************************** UnicodeText ************************** |
| // |
| // A UnicodeText object is a container for a sequence of Unicode |
| // codepoint values. It has a default constructor, a copy constructor, |
| // and an assignment operator. Data can be appended to it from another |
| // UnicodeText, from iterators, or from a single codepoint. |
| // |
| // The internal representation of the text is UTF-8. Since UTF-8 is a |
| // variable-width format, UnicodeText does not provide random access |
| // to the text, and changes to the text are permitted only at the end. |
| // |
| // The UnicodeText class defines a const_iterator. The dereferencing |
| // operator (*) returns a codepoint (char32). The iterator is a |
| // bidirectional, read-only iterator. It becomes invalid if the text |
| // is changed. |
| // |
| // There are methods for appending and retrieving UTF-8 data directly. |
| // The 'utf8_data' method returns a const char* that contains the |
| // UTF-8-encoded version of the text; 'utf8_length' returns the number |
| // of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores |
| // up to 4 bytes of UTF-8 data in a char array and returns the number |
| // of bytes that it stored. |
| // |
| // Codepoints are integers in the range [0, 0xD7FF] or [0xE000, |
| // 0x10FFFF], but UnicodeText has the additional restriction that it |
| // can contain only those characters that are valid for interchange on |
| // the Web. This excludes all of the control codes except for carriage |
| // return, line feed, and horizontal tab. It also excludes |
| // non-characters, but codepoints that are in the Private Use regions |
| // are allowed, as are codepoints that are unassigned. (See the |
| // Unicode reference for details.) The function UniLib::IsInterchangeValid |
| // can be used as a test for this property. |
| // |
| // UnicodeTexts are safe. Every method that constructs or modifies a |
| // UnicodeText tests for interchange-validity, and will substitute a |
| // space for the invalid data. Such cases are reported via |
| // LOG(WARNING). |
| // |
| // MEMORY MANAGEMENT: copy, take ownership, or point to |
| // |
| // A UnicodeText is either an "owner", meaning that it owns the memory |
| // for the data buffer and will free it when the UnicodeText is |
| // destroyed, or it is an "alias", meaning that it does not. |
| // |
| // There are three methods for storing UTF-8 data in a UnicodeText: |
| // |
| // CopyUTF8(buffer, len) copies buffer. |
| // |
| // TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer. |
| // |
| // PointToUTF8(buffer, size) creates an alias pointing to buffer. |
| // |
| // All three methods perform a validity check on the buffer. There are |
| // private, "unsafe" versions of these functions that bypass the |
| // validity check. They are used internally and by friend-functions |
| // that are handling UTF-8 data that has already been validated. |
| // |
| // The purpose of an alias is to avoid making an unnecessary copy of a |
| // UTF-8 buffer while still providing access to the Unicode values |
| // within that text through iterators or the fast scanners that are |
| // based on UTF-8 state tables. The lifetime of an alias must not |
| // exceed the lifetime of the buffer from which it was constructed. |
| // |
| // The semantics of an alias might be described as "copy on write or |
| // repair." The source data is never modified. If push_back() or |
| // append() is called on an alias, a copy of the data will be created, |
| // and the UnicodeText will become an owner. If clear() is called on |
| // an alias, it becomes an (empty) owner. |
| // |
| // The copy constructor and the assignment operator produce an owner. |
| // That is, after direct initialization ("UnicodeText x(y);") or copy |
| // initialization ("UnicodeText x = y;") x will be an owner, even if y |
| // was an alias. The assignment operator ("x = y;") also produces an |
| // owner unless x and y are the same object and y is an alias. |
| // |
| // Aliases should be used with care. If the source from which an alias |
| // was created is freed, or if the contents are changed, while the |
| // alias is still in use, fatal errors could result. But it can be |
| // quite useful to have a UnicodeText "window" through which to see a |
| // UTF-8 buffer without having to pay the price of making a copy. |
| // |
| // UTILITIES |
| // |
| // The interfaces in util/utf8/public/textutils.h provide higher-level |
| // utilities for dealing with UnicodeTexts, including routines for |
| // creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or |
| // strings, creating strings from UnicodeTexts, normalizing text for |
| // efficient matching or display, and others. |
| |
| class UnicodeText { |
| public: |
| class const_iterator; |
| |
| typedef char32 value_type; |
| |
| // Constructors. These always produce owners. |
| UnicodeText(); // Create an empty text. |
| UnicodeText(const UnicodeText& src); // copy constructor |
| // Construct a substring (copies the data). |
| UnicodeText(const const_iterator& first, const const_iterator& last); |
| |
| // Assignment operator. This copies the data and produces an owner |
| // unless this == &src, e.g., "x = x;", which is a no-op. |
| UnicodeText& operator=(const UnicodeText& src); |
| |
| // x.Copy(y) copies the data from y into x. |
| UnicodeText& Copy(const UnicodeText& src); |
| inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); } |
| |
| // x.PointTo(y) changes x so that it points to y's data. |
| // It does not copy y or take ownership of y's data. |
| UnicodeText& PointTo(const UnicodeText& src); |
| UnicodeText& PointTo(const const_iterator& first, |
| const const_iterator& last); |
| |
| ~UnicodeText(); |
| |
| void clear(); // Clear text. |
| bool empty() { return repr_.size_ == 0; } // Test if text is empty. |
| |
| // Add a codepoint to the end of the text. |
| // If the codepoint is not interchange-valid, add a space instead |
| // and log a warning. |
| void push_back(char32 codepoint); |
| |
| // Generic appending operation. |
| // iterator_traits<ForwardIterator>::value_type must be implicitly |
| // convertible to char32. Typical uses of this method might include: |
| // char32 chars[] = {0x1, 0x2, ...}; |
| // vector<char32> more_chars = ...; |
| // utext.append(chars, chars+arraysize(chars)); |
| // utext.append(more_chars.begin(), more_chars.end()); |
| template<typename ForwardIterator> |
| UnicodeText& append(ForwardIterator first, const ForwardIterator last) { |
| while (first != last) { push_back(*first++); } |
| return *this; |
| } |
| |
| // A specialization of the generic append() method. |
| UnicodeText& append(const const_iterator& first, const const_iterator& last); |
| |
| // An optimization of append(source.begin(), source.end()). |
| UnicodeText& append(const UnicodeText& source); |
| |
| int size() const; // the number of Unicode characters (codepoints) |
| |
| friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); |
| friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs); |
| |
| class const_iterator { |
| typedef const_iterator CI; |
| public: |
| typedef bidirectional_iterator_tag iterator_category; |
| typedef char32 value_type; |
| typedef ptrdiff_t difference_type; |
| typedef void pointer; // (Not needed.) |
| typedef const char32 reference; // (Needed for const_reverse_iterator) |
| |
| // Iterators are default-constructible. |
| const_iterator(); |
| |
| // It's safe to make multiple passes over a UnicodeText. |
| const_iterator(const const_iterator& other); |
| const_iterator& operator=(const const_iterator& other); |
| |
| char32 operator*() const; // Dereference |
| |
| const_iterator& operator++(); // Advance (++iter) |
| const_iterator operator++(int) { // (iter++) |
| const_iterator result(*this); |
| ++*this; |
| return result; |
| } |
| |
| const_iterator& operator--(); // Retreat (--iter) |
| const_iterator operator--(int) { // (iter--) |
| const_iterator result(*this); |
| --*this; |
| return result; |
| } |
| |
| // We love relational operators. |
| friend bool operator==(const CI& lhs, const CI& rhs) { |
| return lhs.it_ == rhs.it_; } |
| friend bool operator!=(const CI& lhs, const CI& rhs) { |
| return !(lhs == rhs); } |
| friend bool operator<(const CI& lhs, const CI& rhs); |
| friend bool operator>(const CI& lhs, const CI& rhs) { |
| return rhs < lhs; } |
| friend bool operator<=(const CI& lhs, const CI& rhs) { |
| return !(rhs < lhs); } |
| friend bool operator>=(const CI& lhs, const CI& rhs) { |
| return !(lhs < rhs); } |
| |
| friend difference_type distance(const CI& first, const CI& last); |
| |
| // UTF-8-specific methods |
| // Store the UTF-8 encoding of the current codepoint into buf, |
| // which must be at least 4 bytes long. Return the number of |
| // bytes written. |
| int get_utf8(char* buf) const; |
| // Return the iterator's pointer into the UTF-8 data. |
| const char* utf8_data() const { return it_; } |
| |
| string DebugString() const; |
| |
| private: |
| friend class UnicodeText; |
| friend class UnicodeTextUtils; |
| friend class UTF8StateTableProperty; |
| explicit const_iterator(const char* it) : it_(it) {} |
| |
| const char* it_; |
| }; |
| |
| const_iterator begin() const; |
| const_iterator end() const; |
| |
| class const_reverse_iterator : public std::reverse_iterator<const_iterator> { |
| public: |
| const_reverse_iterator(const_iterator it) : |
| std::reverse_iterator<const_iterator>(it) {} |
| const char* utf8_data() const { |
| const_iterator tmp_it = base(); |
| return (--tmp_it).utf8_data(); |
| } |
| int get_utf8(char* buf) const { |
| const_iterator tmp_it = base(); |
| return (--tmp_it).get_utf8(buf); |
| } |
| }; |
| const_reverse_iterator rbegin() const { |
| return const_reverse_iterator(end()); |
| } |
| const_reverse_iterator rend() const { |
| return const_reverse_iterator(begin()); |
| } |
| |
| // Substring searching. Returns the beginning of the first |
| // occurrence of "look", or end() if not found. |
| const_iterator find(const UnicodeText& look, const_iterator start_pos) const; |
| // Equivalent to find(look, begin()) |
| const_iterator find(const UnicodeText& look) const; |
| |
| // Returns whether this contains the character U+FFFD. This can |
| // occur, for example, if the input to Encodings::Decode() had byte |
| // sequences that were invalid in the source encoding. |
| bool HasReplacementChar() const; |
| |
| // UTF-8-specific methods |
| // |
| // Return the data, length, and capacity of UTF-8-encoded version of |
| // the text. Length and capacity are measured in bytes. |
| const char* utf8_data() const { return repr_.data_; } |
| int utf8_length() const { return repr_.size_; } |
| int utf8_capacity() const { return repr_.capacity_; } |
| |
| // Return the UTF-8 data as a string. |
| static string UTF8Substring(const const_iterator& first, |
| const const_iterator& last); |
| |
| // There are three methods for initializing a UnicodeText from UTF-8 |
| // data. They vary in details of memory management. In all cases, |
| // the data is tested for interchange-validity. If it is not |
| // interchange-valid, a LOG(WARNING) is issued, and each |
| // structurally invalid byte and each interchange-invalid codepoint |
| // is replaced with a space. |
| |
| // x.CopyUTF8(buf, len) copies buf into x. |
| UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); |
| |
| // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of |
| // buf. buf is not copied. |
| UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer, |
| int byte_length, |
| int byte_capacity); |
| |
| // x.PointToUTF8(buf,len) changes x so that it points to buf |
| // ("becomes an alias"). It does not take ownership or copy buf. |
| // If the buffer is not valid, this has the same effect as |
| // CopyUTF8(utf8_buffer, byte_length). |
| UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); |
| |
| // Occasionally it is necessary to use functions that operate on the |
| // pointer returned by utf8_data(). MakeIterator(p) provides a way |
| // to get back to the UnicodeText level. It uses CHECK to ensure |
| // that p is a pointer within this object's UTF-8 data, and that it |
| // points to the beginning of a character. |
| const_iterator MakeIterator(const char* p) const; |
| |
| string DebugString() const; |
| |
| private: |
| friend class const_iterator; |
| friend class UnicodeTextUtils; |
| |
| class Repr { // A byte-string. |
| public: |
| char* data_; |
| int size_; |
| int capacity_; |
| bool ours_; // Do we own data_? |
| |
| Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {} |
| ~Repr() { if (ours_) delete[] data_; } |
| |
| void clear(); |
| void reserve(int capacity); |
| void resize(int size); |
| |
| void append(const char* bytes, int byte_length); |
| void Copy(const char* data, int size); |
| void TakeOwnershipOf(char* data, int size, int capacity); |
| void PointTo(const char* data, int size); |
| |
| string DebugString() const; |
| |
| private: |
| Repr& operator=(const Repr&); |
| Repr(const Repr& other); |
| }; |
| |
| Repr repr_; |
| |
| // UTF-8-specific private methods. |
| // These routines do not perform a validity check when compiled |
| // in opt mode. |
| // It is an error to call these methods with UTF-8 data that |
| // is not interchange-valid. |
| // |
| UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length); |
| UnicodeText& UnsafeTakeOwnershipOfUTF8( |
| char* utf8_buffer, int byte_length, int byte_capacity); |
| UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length); |
| UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length); |
| const_iterator UnsafeFind(const UnicodeText& look, |
| const_iterator start_pos) const; |
| }; |
| |
| bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); |
| |
| inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) { |
| return !(lhs == rhs); |
| } |
| |
| // UnicodeTextRange is a pair of iterators, useful for specifying text |
| // segments. If the iterators are ==, the segment is empty. |
| typedef pair<UnicodeText::const_iterator, |
| UnicodeText::const_iterator> UnicodeTextRange; |
| |
| inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) { |
| return r.first == r.second; |
| } |
| |
| |
| // *************************** Utilities ************************* |
| |
| // A factory function for creating a UnicodeText from a buffer of |
| // UTF-8 data. The new UnicodeText takes ownership of the buffer. (It |
| // is an "owner.") |
| // |
| // Each byte that is structurally invalid will be replaced with a |
| // space. Each codepoint that is interchange-invalid will also be |
| // replaced with a space, even if the codepoint was represented with a |
| // multibyte sequence in the UTF-8 data. |
| // |
| inline UnicodeText MakeUnicodeTextAcceptingOwnership( |
| char* utf8_buffer, int byte_length, int byte_capacity) { |
| return UnicodeText().TakeOwnershipOfUTF8( |
| utf8_buffer, byte_length, byte_capacity); |
| } |
| |
| // A factory function for creating a UnicodeText from a buffer of |
| // UTF-8 data. The new UnicodeText does not take ownership of the |
| // buffer. (It is an "alias.") |
| // |
| inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership( |
| const char* utf8_buffer, int byte_length) { |
| return UnicodeText().PointToUTF8(utf8_buffer, byte_length); |
| } |
| |
| // Create a UnicodeText from a UTF-8 string or buffer. |
| // |
| // If do_copy is true, then a copy of the string is made. The copy is |
| // owned by the resulting UnicodeText object and will be freed when |
| // the object is destroyed. This UnicodeText object is referred to |
| // as an "owner." |
| // |
| // If do_copy is false, then no copy is made. The resulting |
| // UnicodeText object does NOT take ownership of the string; in this |
| // case, the lifetime of the UnicodeText object must not exceed the |
| // lifetime of the string. This Unicodetext object is referred to as |
| // an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership. |
| // |
| // If the input string does not contain valid UTF-8, then a copy is |
| // made (as if do_copy were true) and coerced to valid UTF-8 by |
| // replacing each invalid byte with a space. |
| // |
| inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, |
| bool do_copy) { |
| UnicodeText t; |
| if (do_copy) { |
| t.CopyUTF8(utf8_buf, len); |
| } else { |
| t.PointToUTF8(utf8_buf, len); |
| } |
| return t; |
| } |
| |
| inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) { |
| return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy); |
| } |
| |
| inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) { |
| return UTF8ToUnicodeText(utf8_buf, len, true); |
| } |
| inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) { |
| return UTF8ToUnicodeText(utf8_string, true); |
| } |
| |
| // Return a string containing the UTF-8 encoded version of all the |
| // Unicode characters in t. |
| inline string UnicodeTextToUTF8(const UnicodeText& t) { |
| return string(t.utf8_data(), t.utf8_length()); |
| } |
| |
| } // namespace phonenumbers |
| } // namespace i18n |
| |
| #endif // UTIL_UTF8_UNICODETEXT_H__ |