// utils/utf8/unicodetext.h - platform/external/libtextclassifier - Git at Google

 /*
  * Copyright (C) 2018 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
 #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_

 #include <iterator>
 #include <string>
 #include <utility>

 #include "utils/base/integral_types.h"

 namespace libtextclassifier3 {

 // ***************************** UnicodeText **************************
 //
 // A UnicodeText object is a wrapper around a sequence of Unicode
 // codepoint values that allows iteration over these values.
 //
 // The internal representation of the text is UTF-8. Since UTF-8 is a
 // variable-width format, UnicodeText does not provide random access
 // to the text, and changes to the text are permitted only at the end.
 //
 // The UnicodeText class defines a const_iterator. The dereferencing
 // operator (*) returns a codepoint (int32). The iterator is a
 // read-only iterator. It becomes invalid if the text is changed.
 //
 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
 // 0x10FFFF], but UnicodeText has the additional restriction that it
 // can contain only those characters that are valid for interchange on
 // the Web. This excludes all of the control codes except for carriage
 // return, line feed, and horizontal tab.  It also excludes
 // non-characters, but codepoints that are in the Private Use regions
 // are allowed, as are codepoints that are unassigned. (See the
 // Unicode reference for details.)
 //
 // MEMORY MANAGEMENT:
 //
 // PointToUTF8(buffer, size) creates an alias pointing to buffer.
 //
 // The purpose of an alias is to avoid making an unnecessary copy of a
 // UTF-8 buffer while still providing access to the Unicode values
 // within that text through iterators. The lifetime of an alias must not
 // exceed the lifetime of the buffer from which it was constructed.
 //
 // Aliases should be used with care. If the source from which an alias
 // was created is freed, or if the contents are changed, while the
 // alias is still in use, fatal errors could result. But it can be
 // quite useful to have a UnicodeText "window" through which to see a
 // UTF-8 buffer without having to pay the price of making a copy.

 // A sequence of Unicode codepoints stored as UTF-8 bytes. The bytes live in
 // the private `Repr` member, which records whether it owns its buffer.
 class UnicodeText {
  public:
   class const_iterator;

   UnicodeText();  // Create an empty text.
   UnicodeText(const UnicodeText& src);
   UnicodeText& operator=(UnicodeText&& src);
   ~UnicodeText();

   // Read-only iterator over the codepoints of a UnicodeText. Its only state
   // is a pointer into the underlying UTF-8 bytes.
   class const_iterator {
     typedef const_iterator CI;

    public:
     typedef std::input_iterator_tag iterator_category;
     typedef char32 value_type;
     typedef int difference_type;
     typedef void pointer;            // (Not needed.)
     typedef const char32 reference;  // (Needed for const_reverse_iterator)

     // Iterators are default-constructible.
     const_iterator();

     // It's safe to make multiple passes over a UnicodeText.
     const_iterator& operator=(const const_iterator& other);

     char32 operator*() const;  // Dereference

     const_iterator& operator++();     // Advance (++iter)
     const_iterator operator++(int) {  // (iter++)
       const_iterator result(*this);
       ++*this;
       return result;
     }

     const_iterator& operator--();     // Retreat (--iter)
     const_iterator operator--(int) {  // (iter--)
       const_iterator result(*this);
       --*this;
       return result;
     }

     // Two iterators are equal when they point at the same byte.
     friend bool operator==(const CI& lhs, const CI& rhs) {
       return lhs.it_ == rhs.it_;
     }
     friend bool operator!=(const CI& lhs, const CI& rhs) {
       return !(lhs == rhs);
     }
     // operator< is defined out of line; the remaining orderings are derived
     // from it.
     friend bool operator<(const CI& lhs, const CI& rhs);
     friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; }
     friend bool operator<=(const CI& lhs, const CI& rhs) {
       return !(rhs < lhs);
     }
     friend bool operator>=(const CI& lhs, const CI& rhs) {
       return !(lhs < rhs);
     }

     // Returns the byte length (1-4) of the UTF-8 sequence at the current
     // position, determined from the first byte only. Continuation bytes are
     // not inspected or validated here.
     int utf8_length() const {
       // Plain `char` may be signed. Without this conversion every byte >= 0x80
       // would compare as negative and be reported as a 1-byte sequence.
       const unsigned char lead_byte = static_cast<unsigned char>(it_[0]);
       if (lead_byte < 0x80) {
         return 1;
       } else if (lead_byte < 0xE0) {
         return 2;
       } else if (lead_byte < 0xF0) {
         return 3;
       } else {
         return 4;
       }
     }
     // Returns the raw pointer to the current position in the UTF-8 bytes.
     const char* utf8_data() const { return it_; }

    private:
     friend class UnicodeText;
     explicit const_iterator(const char* it) : it_(it) {}

     const char* it_;
   };

   const_iterator begin() const;
   const_iterator end() const;

   // Gets pointer to the underlying utf8 data.
   const char* data() const;

   // Gets length (in bytes) of the underlying utf8 data.
   int size_bytes() const;

   // Computes length (in number of Unicode codepoints) of the underlying utf8
   // data.
   // NOTE: Complexity O(n).
   int size_codepoints() const;

   bool empty() const;

   // Checks whether the underlying data is valid utf8 data.
   bool is_valid() const;

   bool operator==(const UnicodeText& other) const;

   // NOTE(review): defined out of line; presumably replaces the contents of
   // this text with a copy of `src` -- confirm against the implementation.
   UnicodeText& Copy(const UnicodeText& src);

   // x.PointToUTF8(buf,len) changes x so that it points to buf
   // ("becomes an alias"). It does not take ownership or copy buf.
   // This function assumes that the input is interchange valid UTF8.
   UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);

   // NOTE(review): presumably the copying counterpart of PointToUTF8 (the text
   // ends up with its own copy of the bytes) -- confirm against the
   // implementation.
   UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);

   // Calling this may invalidate pointers to underlying data.
   UnicodeText& AppendUTF8(const char* utf8, int len);
   UnicodeText& push_back(char32 ch);
   void clear();

   std::string ToUTF8String() const;
   static std::string UTF8Substring(const const_iterator& first,
                                    const const_iterator& last);

  private:
   friend class const_iterator;

   class Repr {  // A byte-string.
    public:
     char* data_;
     int size_;
     int capacity_;
     bool ours_;  // Do we own data_?

     // Starts out empty. `ours_` is true, but data_ is null, so the destructor
     // has nothing to free.
     Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
     Repr& operator=(Repr&& src);
     // Frees the buffer only when this Repr owns it; an aliased buffer is left
     // untouched.
     ~Repr() {
       if (ours_) delete[] data_;
     }

     void clear();
     void reserve(int capacity);
     void resize(int size);

     void append(const char* bytes, int byte_length);
     void Copy(const char* data, int size);
     void PointTo(const char* data, int size);

    private:
     // Copy operations are private, so code outside Repr cannot copy it.
     Repr& operator=(const Repr&);
     Repr(const Repr& other);
   };

   Repr repr_;
 };

 // A [first, second) pair of iterators delimiting a span of a UnicodeText.
 using UnicodeTextRange =
     std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator>;

 // Free functions that build a UnicodeText from UTF-8 bytes.
 //
 // NOTE: The following are needed to avoid implicit conversion from char* to
 // std::string, or from ::string to std::string, because if this happens it
 // often results in invalid memory access to a temporary object created during
 // such conversion (if do_copy == false).
 //
 // NOTE(review): the definitions are not visible in this header. Judging by
 // the note above, do_copy == false appears to produce a text that refers to
 // the caller's buffer instead of copying it, in which case that buffer must
 // outlive the returned object -- confirm against the implementation. The
 // overload without `len` presumably expects a NUL-terminated string, and the
 // overload without `do_copy` presumably applies a default chosen in the
 // implementation; verify both before relying on them.
 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy);
 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy);
 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy);
 UnicodeText UTF8ToUnicodeText(const std::string& str);

 }  // namespace libtextclassifier3

 #endif  // LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
	/*
	* Copyright (C) 2018 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
	#define LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_

	#include <iterator>
	#include <string>
	#include <utility>

	#include "utils/base/integral_types.h"

	namespace libtextclassifier3 {

	// *************************** UnicodeText ************************
	//
	// A UnicodeText object is a wrapper around a sequence of Unicode
	// codepoint values that allows iteration over these values.
	//
	// The internal representation of the text is UTF-8. Since UTF-8 is a
	// variable-width format, UnicodeText does not provide random access
	// to the text, and changes to the text are permitted only at the end.
	//
	// The UnicodeText class defines a const_iterator. The dereferencing
	// operator (*) returns a codepoint (int32). The iterator is a
	// read-only iterator. It becomes invalid if the text is changed.
	//
	// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
	// 0x10FFFF], but UnicodeText has the additional restriction that it
	// can contain only those characters that are valid for interchange on
	// the Web. This excludes all of the control codes except for carriage
	// return, line feed, and horizontal tab. It also excludes
	// non-characters, but codepoints that are in the Private Use regions
	// are allowed, as are codepoints that are unassigned. (See the
	// Unicode reference for details.)
	//
	// MEMORY MANAGEMENT:
	//
	// PointToUTF8(buffer, size) creates an alias pointing to buffer.
	//
	// The purpose of an alias is to avoid making an unnecessary copy of a
	// UTF-8 buffer while still providing access to the Unicode values
	// within that text through iterators. The lifetime of an alias must not
	// exceed the lifetime of the buffer from which it was constructed.
	//
	// Aliases should be used with care. If the source from which an alias
	// was created is freed, or if the contents are changed, while the
	// alias is still in use, fatal errors could result. But it can be
	// quite useful to have a UnicodeText "window" through which to see a
	// UTF-8 buffer without having to pay the price of making a copy.

	// A sequence of Unicode codepoints stored as UTF-8 bytes. The bytes live in
	// the private `Repr` member, which records whether it owns its buffer.
	class UnicodeText {
	 public:
	  class const_iterator;

	  UnicodeText();  // Create an empty text.
	  UnicodeText(const UnicodeText& src);
	  UnicodeText& operator=(UnicodeText&& src);
	  ~UnicodeText();

	  // Read-only iterator over the codepoints of a UnicodeText. Its only state
	  // is a pointer into the underlying UTF-8 bytes.
	  class const_iterator {
	    typedef const_iterator CI;

	   public:
	    typedef std::input_iterator_tag iterator_category;
	    typedef char32 value_type;
	    typedef int difference_type;
	    typedef void pointer;            // (Not needed.)
	    typedef const char32 reference;  // (Needed for const_reverse_iterator)

	    // Iterators are default-constructible.
	    const_iterator();

	    // It's safe to make multiple passes over a UnicodeText.
	    const_iterator& operator=(const const_iterator& other);

	    char32 operator*() const;  // Dereference

	    const_iterator& operator++();     // Advance (++iter)
	    const_iterator operator++(int) {  // (iter++)
	      const_iterator result(*this);
	      ++*this;
	      return result;
	    }

	    const_iterator& operator--();     // Retreat (--iter)
	    const_iterator operator--(int) {  // (iter--)
	      const_iterator result(*this);
	      --*this;
	      return result;
	    }

	    // Two iterators are equal when they point at the same byte.
	    friend bool operator==(const CI& lhs, const CI& rhs) {
	      return lhs.it_ == rhs.it_;
	    }
	    friend bool operator!=(const CI& lhs, const CI& rhs) {
	      return !(lhs == rhs);
	    }
	    // operator< is defined out of line; the remaining orderings are derived
	    // from it.
	    friend bool operator<(const CI& lhs, const CI& rhs);
	    friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; }
	    friend bool operator<=(const CI& lhs, const CI& rhs) {
	      return !(rhs < lhs);
	    }
	    friend bool operator>=(const CI& lhs, const CI& rhs) {
	      return !(lhs < rhs);
	    }

	    // Returns the byte length (1-4) of the UTF-8 sequence at the current
	    // position, determined from the first byte only. Continuation bytes are
	    // not inspected or validated here.
	    int utf8_length() const {
	      // Plain `char` may be signed. Without this conversion every byte >= 0x80
	      // would compare as negative and be reported as a 1-byte sequence.
	      const unsigned char lead_byte = static_cast<unsigned char>(it_[0]);
	      if (lead_byte < 0x80) {
	        return 1;
	      } else if (lead_byte < 0xE0) {
	        return 2;
	      } else if (lead_byte < 0xF0) {
	        return 3;
	      } else {
	        return 4;
	      }
	    }
	    // Returns the raw pointer to the current position in the UTF-8 bytes.
	    const char* utf8_data() const { return it_; }

	   private:
	    friend class UnicodeText;
	    explicit const_iterator(const char* it) : it_(it) {}

	    const char* it_;
	  };

	  const_iterator begin() const;
	  const_iterator end() const;

	  // Gets pointer to the underlying utf8 data.
	  const char* data() const;

	  // Gets length (in bytes) of the underlying utf8 data.
	  int size_bytes() const;

	  // Computes length (in number of Unicode codepoints) of the underlying utf8
	  // data.
	  // NOTE: Complexity O(n).
	  int size_codepoints() const;

	  bool empty() const;

	  // Checks whether the underlying data is valid utf8 data.
	  bool is_valid() const;

	  bool operator==(const UnicodeText& other) const;

	  // NOTE(review): defined out of line; presumably replaces the contents of
	  // this text with a copy of `src` -- confirm against the implementation.
	  UnicodeText& Copy(const UnicodeText& src);

	  // x.PointToUTF8(buf,len) changes x so that it points to buf
	  // ("becomes an alias"). It does not take ownership or copy buf.
	  // This function assumes that the input is interchange valid UTF8.
	  UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);

	  // NOTE(review): presumably the copying counterpart of PointToUTF8 (the text
	  // ends up with its own copy of the bytes) -- confirm against the
	  // implementation.
	  UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);

	  // Calling this may invalidate pointers to underlying data.
	  UnicodeText& AppendUTF8(const char* utf8, int len);
	  UnicodeText& push_back(char32 ch);
	  void clear();

	  std::string ToUTF8String() const;
	  static std::string UTF8Substring(const const_iterator& first,
	                                   const const_iterator& last);

	 private:
	  friend class const_iterator;

	  class Repr {  // A byte-string.
	   public:
	    char* data_;
	    int size_;
	    int capacity_;
	    bool ours_;  // Do we own data_?

	    // Starts out empty. `ours_` is true, but data_ is null, so the destructor
	    // has nothing to free.
	    Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
	    Repr& operator=(Repr&& src);
	    // Frees the buffer only when this Repr owns it; an aliased buffer is left
	    // untouched.
	    ~Repr() {
	      if (ours_) delete[] data_;
	    }

	    void clear();
	    void reserve(int capacity);
	    void resize(int size);

	    void append(const char* bytes, int byte_length);
	    void Copy(const char* data, int size);
	    void PointTo(const char* data, int size);

	   private:
	    // Copy operations are private, so code outside Repr cannot copy it.
	    Repr& operator=(const Repr&);
	    Repr(const Repr& other);
	  };

	  Repr repr_;
	};

	// A [first, second) pair of iterators delimiting a span of a UnicodeText.
	using UnicodeTextRange =
	    std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator>;

	// Free functions that build a UnicodeText from UTF-8 bytes.
	//
	// NOTE: The following are needed to avoid implicit conversion from char* to
	// std::string, or from ::string to std::string, because if this happens it
	// often results in invalid memory access to a temporary object created during
	// such conversion (if do_copy == false).
	//
	// NOTE(review): the definitions are not visible in this header. Judging by
	// the note above, do_copy == false appears to produce a text that refers to
	// the caller's buffer instead of copying it, in which case that buffer must
	// outlive the returned object -- confirm against the implementation. The
	// overload without `len` presumably expects a NUL-terminated string, and the
	// overload without `do_copy` presumably applies a default chosen in the
	// implementation; verify both before relying on them.
	UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy);
	UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy);
	UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy);
	UnicodeText UTF8ToUnicodeText(const std::string& str);

	} // namespace libtextclassifier3

	#endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_