pw_tokenizer/public/pw_tokenizer/token_database.h - platform/external/pigweed - Git at Google

 // Copyright 2020 The Pigweed Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy of
 // the License at
 //
 //     https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 // License for the specific language governing permissions and limitations under
 // the License.
 #pragma once

 #include <array>
 #include <cstddef>
 #include <cstdint>
 #include <iterator>

 namespace pw::tokenizer {

 /// Reads entries from a v0 binary token string database. This class does not
 /// copy or modify the contents of the database.
 ///
 /// The v0 token database has two significant shortcomings:
 ///
 ///   - Strings cannot contain null terminators (`\0`). If a string contains a
 ///     `\0`, the database will not work correctly.
 ///   - The domain is not included in entries. All tokens belong to a single
 ///     domain, which must be known independently.
 ///
 /// A v0 binary token database is comprised of a 16-byte header followed by an
 /// array of 8-byte entries and a table of null-terminated strings. The header
 /// specifies the number of entries. Each entry contains information about a
 /// tokenized string: the token and removal date, if any. All fields are little-
 /// endian.
 ///
 /// The token removal date is stored within an unsigned 32-bit integer. It is
 /// stored as `<day> <month> <year>`, where `<day>` and `<month>` are 1 byte
 /// each and `<year>` is two bytes. The fields are set to their maximum value
 /// (`0xFF` or `0xFFFF`) if they are unset. With this format, dates may be
 /// compared naturally as unsigned integers.
 ///
 /// @rst
 ///   ======  ====  =========================
 ///   Header (16 bytes)
 ///   ---------------------------------------
 ///   Offset  Size  Field
 ///   ======  ====  =========================
 ///        0     6  Magic number (``TOKENS``)
 ///        6     2  Version (``00 00``)
 ///        8     4  Entry count
 ///       12     4  Reserved
 ///   ======  ====  =========================
 ///
 ///   ======  ====  ==================================
 ///   Entry (8 bytes)
 ///   ------------------------------------------------
 ///   Offset  Size  Field
 ///   ======  ====  ==================================
 ///        0     4  Token
 ///        4     1  Removal day (1-31, 255 if unset)
 ///        5     1  Removal month (1-12, 255 if unset)
 ///        6     2  Removal year (65535 if unset)
 ///   ======  ====  ==================================
 /// @endrst
 ///
 /// Entries are sorted by token. A string table with a null-terminated string
 /// for each entry in order follows the entries.
 ///
 /// Entries are accessed by iterating over the database. A O(n) `Find` function
 /// is also provided. In typical use, a `TokenDatabase` is preprocessed by a
 /// `pw::tokenizer::Detokenizer` into a `std::unordered_map`.
 class TokenDatabase {
  private:
   // Internal struct that describes how the underlying binary token database
   // stores entries. RawEntries generally should not be used directly. Instead,
   // use an Entry, which contains a pointer to the entry's string.
   struct RawEntry {
     uint32_t token;
     uint32_t date_removed;
   };

   static_assert(sizeof(RawEntry) == 8u);

  public:
   /// An entry in the token database.
   struct Entry {
     /// The token that represents this string.
     uint32_t token;

     /// The date the token and string was removed from the database, or
     /// `0xFFFFFFFF` if it was never removed. Dates are encoded such that
     /// natural integer sorting sorts from oldest to newest dates. The day is
     /// stored an an 8-bit day, 8-bit month, and 16-bit year, packed into a
     /// little-endian `uint32_t`.
     uint32_t date_removed;

     /// The null-terminated string represented by this token.
     const char* string;
   };

   /// Iterator for `TokenDatabase` values.
   class iterator {
    public:
     using difference_type = std::ptrdiff_t;
     using value_type = Entry;
     using pointer = const Entry*;
     using reference = const Entry&;
     using iterator_category = std::forward_iterator_tag;

     constexpr iterator() : entry_{}, raw_(nullptr) {}

     constexpr iterator(const iterator& other) = default;
     constexpr iterator& operator=(const iterator& other) = default;

     constexpr iterator& operator++() {
       raw_ += sizeof(RawEntry);
       ReadRawEntry();
       // Move string_ to the character beyond the next null terminator.
       while (*entry_.string++ != '\0') {
       }
       return *this;
     }
     constexpr iterator operator++(int) {
       iterator previous(*this);
       operator++();
       return previous;
     }
     constexpr bool operator==(const iterator& rhs) const {
       return raw_ == rhs.raw_;
     }
     constexpr bool operator!=(const iterator& rhs) const {
       return raw_ != rhs.raw_;
     }

     constexpr const Entry& operator*() const { return entry_; }

     constexpr const Entry* operator->() const { return &entry_; }

     constexpr difference_type operator-(const iterator& rhs) const {
       return (raw_ - rhs.raw_) / sizeof(RawEntry);
     }

    private:
     friend class TokenDatabase;

     // Constructs a new iterator to a valid entry.
     constexpr iterator(const char* raw_entry, const char* string)
         : entry_{0, 0, string}, raw_{raw_entry} {
       if (raw_entry != string) {  // raw_entry == string if the DB is empty
         ReadRawEntry();
       }
     }

     explicit constexpr iterator(const char* end) : entry_{}, raw_(end) {}

     constexpr void ReadRawEntry() {
       entry_.token = ReadUint32(raw_);
       entry_.date_removed = ReadUint32(raw_ + sizeof(entry_.token));
     }

     Entry entry_;
     const char* raw_;
   };

   using value_type = Entry;
   using size_type = std::size_t;
   using difference_type = std::ptrdiff_t;
   using reference = value_type&;
   using const_reference = const value_type&;
   using pointer = const value_type*;
   using const_pointer = const value_type*;
   using const_iterator = iterator;
   using reverse_iterator = std::reverse_iterator<iterator>;
   using const_reverse_iterator = std::reverse_iterator<const_iterator>;

   /// A list of token entries returned from a `Find` operation. This object can
   /// be iterated over or indexed as an array.
   class Entries {
    public:
     constexpr Entries(const iterator& begin, const iterator& end)
         : begin_(begin), end_(end) {}

     // The number of entries in this list.
     constexpr size_type size() const { return end_ - begin_; }

     // True of the list is empty.
     constexpr bool empty() const { return begin_ == end_; }

     // Accesses the specified entry in this set. The index must be less than
     // size(). This operation is O(n) in size().
     Entry operator[](size_type index) const;

     constexpr const iterator& begin() const { return begin_; }
     constexpr const iterator& end() const { return end_; }

    private:
     iterator begin_;
     iterator end_;
   };

   /// Returns true if the provided data is a valid token database. This checks
   /// the magic number (`TOKENS`), version (which must be `0`), and that there
   /// is is one string for each entry in the database. A database with extra
   /// strings or other trailing data is considered valid.
   template <typename ByteArray>
   static constexpr bool IsValid(const ByteArray& bytes) {
     return HasValidHeader(bytes) && EachEntryHasAString(bytes);
   }

   /// Creates a `TokenDatabase` and checks if the provided data is valid at
   /// compile time. Accepts references to constexpr containers (`array`, `span`,
   /// `string_view`, etc.) with static storage duration. For example:
   ///
   ///  @code{.cpp}
   ///
   ///    constexpr char kMyData[] = ...;
   ///    constexpr TokenDatabase db = TokenDatabase::Create<kMyData>();
   ///
   ///  @endcode
   template <const auto& kDatabaseBytes>
   static constexpr TokenDatabase Create() {
     static_assert(
         HasValidHeader<decltype(kDatabaseBytes)>(kDatabaseBytes),
         "Databases must start with a 16-byte header that begins with TOKENS.");

     static_assert(EachEntryHasAString<decltype(kDatabaseBytes)>(kDatabaseBytes),
                   "The database must have at least one string for each entry.");

     return TokenDatabase(std::data(kDatabaseBytes));
   }

   /// Creates a `TokenDatabase` from the provided byte array. The array may be a
   /// span, array, or other container type. If the data is not valid, returns a
   /// default-constructed database for which ok() is false.
   ///
   /// Prefer the `Create` overload that takes the data as a template parameter
   /// when possible, since that overload verifies data integrity at compile
   /// time.
   template <typename ByteArray>
   static constexpr TokenDatabase Create(const ByteArray& database_bytes) {
     return IsValid<ByteArray>(database_bytes)
                ? TokenDatabase(std::data(database_bytes))
                : TokenDatabase();  // Invalid database.
   }
   /// Creates a database with no data. `ok()` returns false.
   constexpr TokenDatabase() : begin_{.data = nullptr}, end_{.data = nullptr} {}

   /// Returns all entries associated with this token. This is `O(n)`.
   Entries Find(uint32_t token) const;

   /// Returns the total number of entries (unique token-string pairs).
   constexpr size_type size() const {
     return (end_.data - begin_.data) / sizeof(RawEntry);
   }

   /// True if this database was constructed with valid data. The database might
   /// be empty, but it has an intact header and a string for each entry.
   constexpr bool ok() const { return begin_.data != nullptr; }

   /// Returns an iterator for the first token entry.
   constexpr iterator begin() const { return iterator(begin_.data, end_.data); }

   /// Returns an iterator for one past the last token entry.
   constexpr iterator end() const { return iterator(end_.data); }

  private:
   struct Header {
     std::array<char, 6> magic;
     uint16_t version;
     uint32_t entry_count;
     uint32_t reserved;
   };

   static_assert(sizeof(Header) == 2 * sizeof(RawEntry));

   template <typename ByteArray>
   static constexpr bool HasValidHeader(const ByteArray& bytes) {
     static_assert(sizeof(*std::data(bytes)) == 1u);

     if (std::size(bytes) < sizeof(Header)) {
       return false;
     }

     // Check the magic number and version.
     for (size_type i = 0; i < kMagicAndVersion.size(); ++i) {
       if (bytes[i] != kMagicAndVersion[i]) {
         return false;
       }
     }

     return true;
   }

   template <typename ByteArray>
   static constexpr bool EachEntryHasAString(const ByteArray& bytes) {
     const size_type entries = ReadEntryCount(std::data(bytes));

     // Check that the data is large enough to have a string table.
     if (std::size(bytes) < StringTable(entries)) {
       return false;
     }

     // Count the strings in the string table.
     size_type string_count = 0;
     for (auto i = std::begin(bytes) + StringTable(entries); i < std::end(bytes);
          ++i) {
       string_count += (*i == '\0') ? 1 : 0;
     }

     // Check that there is at least one string for each entry.
     return string_count >= entries;
   }

   // Reads the number of entries from a database header. Cast to the bytes to
   // uint8_t to avoid sign extension if T is signed.
   template <typename T>
   static constexpr uint32_t ReadEntryCount(const T* header_bytes) {
     const T* bytes = header_bytes + offsetof(Header, entry_count);
     return ReadUint32(bytes);
   }

   template <typename T>
   static constexpr uint32_t ReadUint32(const T* bytes) {
     return static_cast<uint8_t>(bytes[0]) |
            static_cast<uint8_t>(bytes[1]) << 8 |
            static_cast<uint8_t>(bytes[2]) << 16 |
            static_cast<uint8_t>(bytes[3]) << 24;
   }

   // Calculates the offset of the string table.
   static constexpr size_type StringTable(size_type entries) {
     return sizeof(Header) + entries * sizeof(RawEntry);
   }

   // The magic number that starts the table is "TOKENS". The version is encoded
   // next as two bytes.
   static constexpr std::array<char, 8> kMagicAndVersion = {
       'T', 'O', 'K', 'E', 'N', 'S', '\0', '\0'};

   template <typename Byte>
   constexpr TokenDatabase(const Byte bytes[])
       : TokenDatabase(bytes + sizeof(Header),
                       bytes + StringTable(ReadEntryCount(bytes))) {
     static_assert(sizeof(Byte) == 1u);
   }

   // It is illegal to reinterpret_cast in constexpr functions, but acceptable to
   // use unions. Instead of using a reinterpret_cast to change the byte pointer
   // to a RawEntry pointer, have a separate overload for each byte pointer type
   // and store them in a union.
   constexpr TokenDatabase(const char* begin, const char* end)
       : begin_{.data = begin}, end_{.data = end} {}

   constexpr TokenDatabase(const unsigned char* begin, const unsigned char* end)
       : begin_{.unsigned_data = begin}, end_{.unsigned_data = end} {}

   constexpr TokenDatabase(const signed char* begin, const signed char* end)
       : begin_{.signed_data = begin}, end_{.signed_data = end} {}

   // Store the beginning and end pointers as a union to avoid breaking constexpr
   // rules for reinterpret_cast.
   union {
     const char* data;
     const unsigned char* unsigned_data;
     const signed char* signed_data;
   } begin_, end_;
 };

 }  // namespace pw::tokenizer
	// Copyright 2020 The Pigweed Authors
	//
	// Licensed under the Apache License, Version 2.0 (the "License"); you may not
	// use this file except in compliance with the License. You may obtain a copy of
	// the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
	// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
	// License for the specific language governing permissions and limitations under
	// the License.
	#pragma once

	#include <array>
	#include <cstddef>
	#include <cstdint>
	#include <iterator>

	namespace pw::tokenizer {

	/// Reads entries from a v0 binary token string database. This class does not
	/// copy or modify the contents of the database.
	///
	/// The v0 token database has two significant shortcomings:
	///
	/// - Strings cannot contain null terminators (`\0`). If a string contains a
	/// `\0`, the database will not work correctly.
	/// - The domain is not included in entries. All tokens belong to a single
	/// domain, which must be known independently.
	///
	/// A v0 binary token database is comprised of a 16-byte header followed by an
	/// array of 8-byte entries and a table of null-terminated strings. The header
	/// specifies the number of entries. Each entry contains information about a
	/// tokenized string: the token and removal date, if any. All fields are little-
	/// endian.
	///
	/// The token removal date is stored within an unsigned 32-bit integer. It is
	/// stored as `<day> <month> <year>`, where `<day>` and `<month>` are 1 byte
	/// each and `<year>` is two bytes. The fields are set to their maximum value
	/// (`0xFF` or `0xFFFF`) if they are unset. With this format, dates may be
	/// compared naturally as unsigned integers.
	///
	/// @rst
	/// ====== ==== =========================
	/// Header (16 bytes)
	/// ---------------------------------------
	/// Offset Size Field
	/// ====== ==== =========================
	/// 0 6 Magic number (``TOKENS``)
	/// 6 2 Version (``00 00``)
	/// 8 4 Entry count
	/// 12 4 Reserved
	/// ====== ==== =========================
	///
	/// ====== ==== ==================================
	/// Entry (8 bytes)
	/// ------------------------------------------------
	/// Offset Size Field
	/// ====== ==== ==================================
	/// 0 4 Token
	/// 4 1 Removal day (1-31, 255 if unset)
	/// 5 1 Removal month (1-12, 255 if unset)
	/// 6 2 Removal year (65535 if unset)
	/// ====== ==== ==================================
	/// @endrst
	///
	/// Entries are sorted by token. A string table with a null-terminated string
	/// for each entry in order follows the entries.
	///
	/// Entries are accessed by iterating over the database. A O(n) `Find` function
	/// is also provided. In typical use, a `TokenDatabase` is preprocessed by a
	/// `pw::tokenizer::Detokenizer` into a `std::unordered_map`.
	class TokenDatabase {
	private:
	// Internal struct that describes how the underlying binary token database
	// stores entries. RawEntries generally should not be used directly. Instead,
	// use an Entry, which contains a pointer to the entry's string.
	struct RawEntry {
	uint32_t token;
	uint32_t date_removed;
	};

	static_assert(sizeof(RawEntry) == 8u);

	public:
	/// An entry in the token database.
	struct Entry {
	/// The token that represents this string.
	uint32_t token;

	/// The date the token and string was removed from the database, or
	/// `0xFFFFFFFF` if it was never removed. Dates are encoded such that
	/// natural integer sorting sorts from oldest to newest dates. The day is
	/// stored an an 8-bit day, 8-bit month, and 16-bit year, packed into a
	/// little-endian `uint32_t`.
	uint32_t date_removed;

	/// The null-terminated string represented by this token.
	const char* string;
	};

	/// Iterator for `TokenDatabase` values.
	class iterator {
	public:
	using difference_type = std::ptrdiff_t;
	using value_type = Entry;
	using pointer = const Entry*;
	using reference = const Entry&;
	using iterator_category = std::forward_iterator_tag;

	constexpr iterator() : entry_{}, raw_(nullptr) {}

	constexpr iterator(const iterator& other) = default;
	constexpr iterator& operator=(const iterator& other) = default;

	constexpr iterator& operator++() {
	raw_ += sizeof(RawEntry);
	ReadRawEntry();
	// Move string_ to the character beyond the next null terminator.
	while (*entry_.string++ != '\0') {
	}
	return *this;
	}
	constexpr iterator operator++(int) {
	iterator previous(*this);
	operator++();
	return previous;
	}
	constexpr bool operator==(const iterator& rhs) const {
	return raw_ == rhs.raw_;
	}
	constexpr bool operator!=(const iterator& rhs) const {
	return raw_ != rhs.raw_;
	}

	constexpr const Entry& operator*() const { return entry_; }

	constexpr const Entry* operator->() const { return &entry_; }

	constexpr difference_type operator-(const iterator& rhs) const {
	return (raw_ - rhs.raw_) / sizeof(RawEntry);
	}

	private:
	friend class TokenDatabase;

	// Constructs a new iterator to a valid entry.
	constexpr iterator(const char* raw_entry, const char* string)
	: entry_{0, 0, string}, raw_{raw_entry} {
	if (raw_entry != string) { // raw_entry == string if the DB is empty
	ReadRawEntry();
	}
	}

	explicit constexpr iterator(const char* end) : entry_{}, raw_(end) {}

	constexpr void ReadRawEntry() {
	entry_.token = ReadUint32(raw_);
	entry_.date_removed = ReadUint32(raw_ + sizeof(entry_.token));
	}

	Entry entry_;
	const char* raw_;
	};

	using value_type = Entry;
	using size_type = std::size_t;
	using difference_type = std::ptrdiff_t;
	using reference = value_type&;
	using const_reference = const value_type&;
	using pointer = const value_type*;
	using const_pointer = const value_type*;
	using const_iterator = iterator;
	using reverse_iterator = std::reverse_iterator<iterator>;
	using const_reverse_iterator = std::reverse_iterator<const_iterator>;

	/// A list of token entries returned from a `Find` operation. This object can
	/// be iterated over or indexed as an array.
	class Entries {
	public:
	constexpr Entries(const iterator& begin, const iterator& end)
	: begin_(begin), end_(end) {}

	// The number of entries in this list.
	constexpr size_type size() const { return end_ - begin_; }

	// True of the list is empty.
	constexpr bool empty() const { return begin_ == end_; }

	// Accesses the specified entry in this set. The index must be less than
	// size(). This operation is O(n) in size().
	Entry operator[](size_type index) const;

	constexpr const iterator& begin() const { return begin_; }
	constexpr const iterator& end() const { return end_; }

	private:
	iterator begin_;
	iterator end_;
	};

	/// Returns true if the provided data is a valid token database. This checks
	/// the magic number (`TOKENS`), version (which must be `0`), and that there
	/// is is one string for each entry in the database. A database with extra
	/// strings or other trailing data is considered valid.
	template <typename ByteArray>
	static constexpr bool IsValid(const ByteArray& bytes) {
	return HasValidHeader(bytes) && EachEntryHasAString(bytes);
	}

	/// Creates a `TokenDatabase` and checks if the provided data is valid at
	/// compile time. Accepts references to constexpr containers (`array`, `span`,
	/// `string_view`, etc.) with static storage duration. For example:
	///
	/// @code{.cpp}
	///
	/// constexpr char kMyData[] = ...;
	/// constexpr TokenDatabase db = TokenDatabase::Create<kMyData>();
	///
	/// @endcode
	template <const auto& kDatabaseBytes>
	static constexpr TokenDatabase Create() {
	static_assert(
	HasValidHeader<decltype(kDatabaseBytes)>(kDatabaseBytes),
	"Databases must start with a 16-byte header that begins with TOKENS.");

	static_assert(EachEntryHasAString<decltype(kDatabaseBytes)>(kDatabaseBytes),
	"The database must have at least one string for each entry.");

	return TokenDatabase(std::data(kDatabaseBytes));
	}

	/// Creates a `TokenDatabase` from the provided byte array. The array may be a
	/// span, array, or other container type. If the data is not valid, returns a
	/// default-constructed database for which ok() is false.
	///
	/// Prefer the `Create` overload that takes the data as a template parameter
	/// when possible, since that overload verifies data integrity at compile
	/// time.
	template <typename ByteArray>
	static constexpr TokenDatabase Create(const ByteArray& database_bytes) {
	return IsValid<ByteArray>(database_bytes)
	? TokenDatabase(std::data(database_bytes))
	: TokenDatabase(); // Invalid database.
	}
	/// Creates a database with no data. `ok()` returns false.
	constexpr TokenDatabase() : begin_{.data = nullptr}, end_{.data = nullptr} {}

	/// Returns all entries associated with this token. This is `O(n)`.
	Entries Find(uint32_t token) const;

	/// Returns the total number of entries (unique token-string pairs).
	constexpr size_type size() const {
	return (end_.data - begin_.data) / sizeof(RawEntry);
	}

	/// True if this database was constructed with valid data. The database might
	/// be empty, but it has an intact header and a string for each entry.
	constexpr bool ok() const { return begin_.data != nullptr; }

	/// Returns an iterator for the first token entry.
	constexpr iterator begin() const { return iterator(begin_.data, end_.data); }

	/// Returns an iterator for one past the last token entry.
	constexpr iterator end() const { return iterator(end_.data); }

	private:
	struct Header {
	std::array<char, 6> magic;
	uint16_t version;
	uint32_t entry_count;
	uint32_t reserved;
	};

	static_assert(sizeof(Header) == 2 * sizeof(RawEntry));

	template <typename ByteArray>
	static constexpr bool HasValidHeader(const ByteArray& bytes) {
	static_assert(sizeof(*std::data(bytes)) == 1u);

	if (std::size(bytes) < sizeof(Header)) {
	return false;
	}

	// Check the magic number and version.
	for (size_type i = 0; i < kMagicAndVersion.size(); ++i) {
	if (bytes[i] != kMagicAndVersion[i]) {
	return false;
	}
	}

	return true;
	}

	template <typename ByteArray>
	static constexpr bool EachEntryHasAString(const ByteArray& bytes) {
	const size_type entries = ReadEntryCount(std::data(bytes));

	// Check that the data is large enough to have a string table.
	if (std::size(bytes) < StringTable(entries)) {
	return false;
	}

	// Count the strings in the string table.
	size_type string_count = 0;
	for (auto i = std::begin(bytes) + StringTable(entries); i < std::end(bytes);
	++i) {
	string_count += (*i == '\0') ? 1 : 0;
	}

	// Check that there is at least one string for each entry.
	return string_count >= entries;
	}

	// Reads the number of entries from a database header. Cast to the bytes to
	// uint8_t to avoid sign extension if T is signed.
	template <typename T>
	static constexpr uint32_t ReadEntryCount(const T* header_bytes) {
	const T* bytes = header_bytes + offsetof(Header, entry_count);
	return ReadUint32(bytes);
	}

	template <typename T>
	static constexpr uint32_t ReadUint32(const T* bytes) {
	return static_cast<uint8_t>(bytes[0]) \|
	static_cast<uint8_t>(bytes[1]) << 8 \|
	static_cast<uint8_t>(bytes[2]) << 16 \|
	static_cast<uint8_t>(bytes[3]) << 24;
	}

	// Calculates the offset of the string table.
	static constexpr size_type StringTable(size_type entries) {
	return sizeof(Header) + entries * sizeof(RawEntry);
	}

	// The magic number that starts the table is "TOKENS". The version is encoded
	// next as two bytes.
	static constexpr std::array<char, 8> kMagicAndVersion = {
	'T', 'O', 'K', 'E', 'N', 'S', '\0', '\0'};

	template <typename Byte>
	constexpr TokenDatabase(const Byte bytes[])
	: TokenDatabase(bytes + sizeof(Header),
	bytes + StringTable(ReadEntryCount(bytes))) {
	static_assert(sizeof(Byte) == 1u);
	}

	// It is illegal to reinterpret_cast in constexpr functions, but acceptable to
	// use unions. Instead of using a reinterpret_cast to change the byte pointer
	// to a RawEntry pointer, have a separate overload for each byte pointer type
	// and store them in a union.
	constexpr TokenDatabase(const char* begin, const char* end)
	: begin_{.data = begin}, end_{.data = end} {}

	constexpr TokenDatabase(const unsigned char* begin, const unsigned char* end)
	: begin_{.unsigned_data = begin}, end_{.unsigned_data = end} {}

	constexpr TokenDatabase(const signed char* begin, const signed char* end)
	: begin_{.signed_data = begin}, end_{.signed_data = end} {}

	// Store the beginning and end pointers as a union to avoid breaking constexpr
	// rules for reinterpret_cast.
	union {
	const char* data;
	const unsigned char* unsigned_data;
	const signed char* signed_data;
	} begin_, end_;
	};

	} // namespace pw::tokenizer