libs/minikin/Locale.cpp - platform/frameworks/minikin - Git at Google

 /*
  * Copyright (C) 2015 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "Locale.h"

 #include <algorithm>

 #include <hb.h>

 #include "minikin/LocaleList.h"

 #include "LocaleListCache.h"
 #include "MinikinInternal.h"
 #include "StringPiece.h"

 namespace minikin {

 // Mask selecting the low five bits (0x1f). The pack/unpack helpers in this file store each
 // character of a language, region or script code in a 5-bit field and use this mask to
 // extract one field at a time.
 constexpr uint32_t FIVE_BITS = 0x1f;

 // Returns the locale list ID for the given locale string by forwarding it unchanged to
 // LocaleListCache::getId().
 uint32_t registerLocaleList(const std::string& locales) {
     return LocaleListCache::getId(locales);
 }

 std::string getLocaleString(uint32_t localeId) {
     const LocaleList& localeList = LocaleListCache::getById(localeId);
     std::string out;
     for (size_t i = 0; i < localeList.size(); ++i) {
         if (i != 0) {
             out += ",";
         }
         out += localeList[i].getString();
     }
     return out;
 }

 // Returns true when the first subtagLen bytes of buf equal subtag and the match ends on a
 // subtag boundary, i.e. the buffer ends there or the next byte is NUL, '-' or '_'.
 // Returns false when buf is shorter than subtagLen.
 static bool isSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) {
     const bool prefixMatches = bufLen >= subtagLen && strncmp(buf, subtag, subtagLen) == 0;
     if (!prefixMatches) {
         return false;
     }
     if (bufLen == subtagLen) {
         return true;  // The subtag is the whole buffer.
     }
     // There is at least one more byte; it must be a terminator or a separator.
     switch (buf[subtagLen]) {
         case '\0':
         case '-':
         case '_':
             return true;
         default:
             return false;
     }
 }

 // Packs a two- or three-character code into the low 15 bits of a 16-bit integer, using one
 // 5-bit field per character; the highest bit stays 0 for validated input.
 //
 // Each character is stored as its offset from a base character. Three-character codes are
 // either all digits (region, 10 possible values each) or all lowercase letters (language, 26
 // possible values each), so every offset fits in 5 bits.
 //
 // A two-character code fills the top field with all ones (0x1f) as a marker and stores its two
 // characters, relative to twoLetterBase, in the lower two fields. Any other length is handled
 // as a three-character code relative to threeLetterBase. No validation is done here.
 static uint16_t packLanguageOrRegion(const StringPiece& in, uint8_t twoLetterBase,
                                      uint8_t threeLetterBase) {
     if (in.length() == 2) {
         const uint16_t middle = static_cast<uint16_t>(in[0] - twoLetterBase);
         const uint16_t low = static_cast<uint16_t>(in[1] - twoLetterBase);
         // 0x7c00 == 0x1f << 10, the two-letter marker in the top field.
         return static_cast<uint16_t>(0x7c00u | (middle << 5) | low);
     }

     const uint16_t high = static_cast<uint16_t>(in[0] - threeLetterBase);
     const uint16_t middle = static_cast<uint16_t>(in[1] - threeLetterBase);
     const uint16_t low = static_cast<uint16_t>(in[2] - threeLetterBase);
     return static_cast<uint16_t>((high << 10) | (middle << 5) | low);
 }

 // Inverse of packLanguageOrRegion(): splits `in` into three 5-bit fields and writes the decoded
 // characters to `out` (no NUL terminator is written). If the top field is 0x1f the value is a
 // two-character code decoded against twoLetterBase; otherwise all three fields are decoded
 // against threeLetterBase. Returns the number of characters written (2 or 3).
 static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase,
                                      uint8_t threeLetterBase) {
     const uint8_t high = (in >> 10) & FIVE_BITS;
     const uint8_t middle = (in >> 5) & FIVE_BITS;
     const uint8_t low = in & FIVE_BITS;

     const bool isTwoLetterCode = (high == 0x1f);
     if (isTwoLetterCode) {
         out[0] = middle + twoLetterBase;
         out[1] = low + twoLetterBase;
         return 2;
     }

     out[0] = high + threeLetterBase;
     out[1] = middle + threeLetterBase;
     out[2] = low + threeLetterBase;
     return 3;
 }

 // Packs a language code with packLanguageOrRegion(), using 'a' as the base character for both
 // the two-letter and the three-letter form.
 static uint16_t packLanguage(const StringPiece& in) {
     return packLanguageOrRegion(in, 'a', 'a');
 }

 // Decodes a packed language code into `out` with unpackLanguageOrRegion(), using 'a' as the
 // base character for both forms. Returns the number of characters written.
 static size_t unpackLanguage(uint16_t in, char* out) {
     return unpackLanguageOrRegion(in, out, 'a', 'a');
 }

 // Packs a four-character script code into 20 bits, 5 bits per character. The first character
 // is stored as its offset from 'A', the remaining three as offsets from 'a'.
 constexpr uint32_t packScript(char c1, char c2, char c3, char c4) {
     return (static_cast<uint32_t>(c1 - 'A') << 15) | (static_cast<uint32_t>(c2 - 'a') << 10) |
            (static_cast<uint32_t>(c3 - 'a') << 5) | static_cast<uint32_t>(c4 - 'a');
 }

 // Packs a script given as a 32-bit tag whose four bytes, from most to least significant, are
 // the four characters of the script code.
 constexpr uint32_t packScript(uint32_t script) {
     return packScript(static_cast<char>(script >> 24), static_cast<char>((script >> 16) & 0xff),
                       static_cast<char>((script >> 8) & 0xff), static_cast<char>(script & 0xff));
 }

 // Inverse of packScript(): rebuilds the 32-bit tag whose bytes, from most to least
 // significant, are the four script characters. The bits above bit 15 hold the first character
 // (offset from 'A'); the three 5-bit fields below hold the rest (offsets from 'a').
 constexpr uint32_t unpackScript(uint32_t packedScript) {
     return (((packedScript >> 15) + 'A') << 24) |
            ((((packedScript >> 10) & FIVE_BITS) + 'a') << 16) |
            ((((packedScript >> 5) & FIVE_BITS) + 'a') << 8) |
            ((packedScript & FIVE_BITS) + 'a');
 }

 // Packs a region code with packLanguageOrRegion(): two-character codes are stored relative
 // to 'A', three-character codes relative to '0'.
 static uint16_t packRegion(const StringPiece& in) {
     return packLanguageOrRegion(in, 'A', '0');
 }

 // Decodes a packed region code into `out` with unpackLanguageOrRegion(): two-character codes
 // are rebuilt relative to 'A', three-character codes relative to '0'. Returns the number of
 // characters written.
 static size_t unpackRegion(uint16_t in, char* out) {
     return unpackLanguageOrRegion(in, out, 'A', '0');
 }

 // True for the ASCII lowercase letters 'a'..'z' only.
 static inline bool isLowercase(char c) {
     return c >= 'a' && c <= 'z';
 }

 // True for the ASCII uppercase letters 'A'..'Z' only.
 static inline bool isUppercase(char c) {
     return c >= 'A' && c <= 'Z';
 }

 // True for the ASCII decimal digits '0'..'9' only.
 static inline bool isDigit(char c) {
     return c >= '0' && c <= '9';
 }

 // Returns true if the buffer is valid for language code.
 static inline bool isValidLanguageCode(const StringPiece& buffer) {
     if (buffer.length() != 2 && buffer.length() != 3) return false;
     if (!isLowercase(buffer[0])) return false;
     if (!isLowercase(buffer[1])) return false;
     if (buffer.length() == 3 && !isLowercase(buffer[2])) return false;
     return true;
 }

 // Returns true if the buffer holds a script code: exactly four characters, an ASCII uppercase
 // letter followed by three ASCII lowercase letters.
 static inline bool isValidScriptCode(const StringPiece& buffer) {
     if (buffer.size() != 4) {
         return false;
     }
     if (!isUppercase(buffer[0])) {
         return false;
     }
     return isLowercase(buffer[1]) && isLowercase(buffer[2]) && isLowercase(buffer[3]);
 }

 // Returns true if the buffer holds a region code: either two ASCII uppercase letters or three
 // ASCII digits.
 static inline bool isValidRegionCode(const StringPiece& buffer) {
     switch (buffer.size()) {
         case 2:
             return isUppercase(buffer[0]) && isUppercase(buffer[1]);
         case 3:
             return isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2]);
         default:
             return false;
     }
 }

 // Parse BCP 47 language identifier into internal structure.
 //
 // The input is split on '-' and consumed left to right in the order: language, optional
 // script, optional region, optional German orthography variant. Whatever remains is only
 // inspected for a "-u-" Unicode extension carrying an emoji style.
 //
 // The object is first default-constructed (delegating constructor). If the first subtag is not
 // a valid two- or three-letter language code, or the input is a language code only, the
 // constructor returns right away with the remaining fields at their default values.
 Locale::Locale(const StringPiece& input) : Locale() {
     SplitIterator it(input, '-');

     StringPiece language = it.next();
     if (isValidLanguageCode(language)) {
         mLanguage = packLanguage(language);
     } else {
         // We don't understand anything other than two-letter or three-letter
         // language codes, so we skip parsing the rest of the string.
         return;
     }

     if (!it.hasNext()) {
         return;  // Language code only.
     }
     StringPiece token = it.next();

     // Optional script subtag. If present, `token` advances to the following subtag so that
     // the region check below looks at the right one.
     if (isValidScriptCode(token)) {
         mScript = packScript(token[0], token[1], token[2], token[3]);
         mSubScriptBits = scriptToSubScriptBits(mScript);

         if (!it.hasNext()) {
             goto finalize;  // No variant, emoji subtag and region code.
         }
         token = it.next();
     }

     // Optional region subtag, handled the same way as the script above.
     if (isValidRegionCode(token)) {
         mRegion = packRegion(token);

         if (!it.hasNext()) {
             goto finalize;  // No variant or emoji subtag.
         }
         token = it.next();
     }

     if (language == "de") {  // We are only interested in German variants.
         if (token == "1901") {
             mVariant = Variant::GERMAN_1901_ORTHOGRAPHY;
         } else if (token == "1996") {
             mVariant = Variant::GERMAN_1996_ORTHOGRAPHY;
         }

         if (mVariant != Variant::NO_VARIANT) {
             if (!it.hasNext()) {
                 goto finalize;  // No emoji subtag.
             }

             // NOTE(review): the token fetched here is not read again in this function; the
             // extension lookup below rescans the whole input instead.
             token = it.next();
         }
     }

     // Searches the complete input (not just the unconsumed tail) for a "-u-" extension and,
     // if one is found, sets mEmojiStyle from it.
     resolveUnicodeExtension(input.data(), input.length());

 finalize:
     // Without an explicit emoji style, fall back to the style implied by the script (if any).
     if (mEmojiStyle == EmojiStyle::EMPTY) {
         mEmojiStyle = scriptToEmojiStyle(mScript);
     }
 }

 // Finds the first "-u-" in [buf, buf + length) and, if present, sets mEmojiStyle to whatever
 // resolveEmojiStyle() reports for the text following it. When there is no "-u-", mEmojiStyle
 // is left unchanged.
 void Locale::resolveUnicodeExtension(const char* buf, size_t length) {
     static const char kPrefix[] = "-u-";
     const size_t prefixLength = strlen(kPrefix);
     const char* const end = buf + length;

     const char* const match = std::search(buf, end, kPrefix, kPrefix + prefixLength);
     if (match == end) {
         return;  // No Unicode extension in the input.
     }

     const char* const extension = match + prefixLength;
     const size_t remainingLength = length - (extension - buf);
     mEmojiStyle = resolveEmojiStyle(extension, remainingLength);
 }

 // static
 // Looks up the emoji ("em") key in the given text and determines the emoji style.
 //
 // `buf` is expected to point just past the "-u-" introducer of a Unicode extension (this is
 // how resolveUnicodeExtension() calls it) and `length` is the number of bytes available.
 // Returns EMOJI, TEXT or DEFAULT for "em-emoji", "em-text" and "em-default" respectively, and
 // EMPTY when there is no "em" key or its value is not one of those three.
 //
 // The "em-" key only counts when it begins a subtag, i.e. it sits at the very start of the
 // buffer or directly after a '-' or '_' separator. This keeps an unrelated subtag that merely
 // ends in "em" (for example "...-x-poem-text") from being taken for the emoji key.
 EmojiStyle Locale::resolveEmojiStyle(const char* buf, size_t length) {
     // "em-text" (7 characters) is the shortest key/value pair recognized here, so shorter
     // input cannot contain an emoji subtag and the search can be skipped.
     constexpr size_t kMinSubtagLength = 7;
     if (length < kMinSubtagLength) {
         return EmojiStyle::EMPTY;
     }

     static const char kPrefix[] = "em-";
     const size_t prefixLength = strlen(kPrefix);
     const char* const end = buf + length;

     for (const char* pos = std::search(buf, end, kPrefix, kPrefix + prefixLength); pos != end;
          pos = std::search(pos + 1, end, kPrefix, kPrefix + prefixLength)) {
         // Skip matches in the middle of another subtag and keep looking.
         const bool atSubtagStart = (pos == buf) || pos[-1] == '-' || pos[-1] == '_';
         if (!atSubtagStart) {
             continue;
         }

         const char* const value = pos + prefixLength;
         const size_t remainingLength = static_cast<size_t>(end - value);
         if (isSubtag(value, remainingLength, "emoji", 5)) {
             return EmojiStyle::EMOJI;
         }
         if (isSubtag(value, remainingLength, "text", 4)) {
             return EmojiStyle::TEXT;
         }
         if (isSubtag(value, remainingLength, "default", 7)) {
             return EmojiStyle::DEFAULT;
         }
         // The "em" key is present but carries a value we do not know.
         return EmojiStyle::EMPTY;
     }
     return EmojiStyle::EMPTY;
 }

 // Resolves the emoji style implied by a packed script code, for use when no emoji subtag was
 // provided: "Zsye" maps to EMOJI, "Zsym" maps to TEXT, anything else to EMPTY.
 EmojiStyle Locale::scriptToEmojiStyle(uint32_t script) {
     switch (script) {
         case packScript('Z', 's', 'y', 'e'):
             return EmojiStyle::EMOJI;
         case packScript('Z', 's', 'y', 'm'):
             return EmojiStyle::TEXT;
         default:
             return EmojiStyle::EMPTY;
     }
 }

 // static
 // Maps a packed script code to the set of sub-script flags it covers. Scripts not listed
 // here map to 0 (no flags).
 uint8_t Locale::scriptToSubScriptBits(uint32_t script) {
     switch (script) {
         case packScript('B', 'o', 'p', 'o'):
             return kBopomofoFlag;
         case packScript('H', 'a', 'n', 'g'):
             return kHangulFlag;
         case packScript('H', 'a', 'n', 'b'):
             // Bopomofo is almost exclusively used in Taiwan.
             return kHanFlag | kBopomofoFlag;
         case packScript('H', 'a', 'n', 'i'):
             return kHanFlag;
         case packScript('H', 'a', 'n', 's'):
             return kHanFlag | kSimplifiedChineseFlag;
         case packScript('H', 'a', 'n', 't'):
             return kHanFlag | kTraditionalChineseFlag;
         case packScript('H', 'i', 'r', 'a'):
             return kHiraganaFlag;
         case packScript('H', 'r', 'k', 't'):
             return kKatakanaFlag | kHiraganaFlag;
         case packScript('J', 'p', 'a', 'n'):
             return kHanFlag | kKatakanaFlag | kHiraganaFlag;
         case packScript('K', 'a', 'n', 'a'):
             return kKatakanaFlag;
         case packScript('K', 'o', 'r', 'e'):
             return kHanFlag | kHangulFlag;
         default:
             return 0u;
     }
 }

 // Returns the locale as a string, as produced by buildLocaleString().
 std::string Locale::getString() const {
     char buffer[32];
     const int length = buildLocaleString(buffer);
     return std::string(buffer, length);
 }

 // Returns the locale string from buildLocaleString() followed by a Unicode extension that
 // encodes the requested line break options:
 //   - "-u" is appended if either option is set,
 //   - "-lb-loose" / "-lb-normal" / "-lb-strict" for lbStyle,
 //   - "-lw-phrase" for lbWordStyle.
 // With both options set to None the plain locale string is returned.
 std::string Locale::getStringWithLineBreakOption(LineBreakStyle lbStyle,
                                                  LineBreakWordStyle lbWordStyle) const {
     char buf[48];
     int pos = buildLocaleString(buf);

     // Copies a NUL-terminated literal, without its terminator, to the current end of buf.
     const auto append = [&buf, &pos](const char* text) {
         for (; *text != '\0'; ++text) {
             buf[pos++] = *text;
         }
     };

     const bool hasLineBreakStyle = lbStyle != LineBreakStyle::None;
     const bool hasWordStyle = lbWordStyle != LineBreakWordStyle::None;

     // Add line break unicode extension.
     if (hasLineBreakStyle || hasWordStyle) {
         append("-u");
     }

     if (hasLineBreakStyle) {
         append("-lb-");
         switch (lbStyle) {
             case LineBreakStyle::Loose:
                 append("loose");
                 break;
             case LineBreakStyle::Normal:
                 append("normal");
                 break;
             case LineBreakStyle::Strict:
                 append("strict");
                 break;
             default:
                 MINIKIN_ASSERT(false, "Must not reached.");
         }
     }

     if (hasWordStyle) {
         append("-lw-");
         switch (lbWordStyle) {
             case LineBreakWordStyle::Phrase:
                 append("phrase");
                 break;
             default:
                 MINIKIN_ASSERT(false, "Must not reached.");
         }
     }
     return std::string(buf, pos);
 }

 // Writes the locale into buf as "<language>[-<Script>][-<region>][-<variant>]" and returns
 // the number of characters written. No NUL terminator is written. "und" is used when no
 // language is set; the only variants emitted are "1901" and "1996". The caller must provide
 // a buffer large enough for all parts.
 int Locale::buildLocaleString(char* buf) const {
     size_t length = 0;

     if (mLanguage != NO_LANGUAGE) {
         length = unpackLanguage(mLanguage, buf);
     } else {
         buf[length++] = 'u';
         buf[length++] = 'n';
         buf[length++] = 'd';
     }

     if (mScript != NO_SCRIPT) {
         const uint32_t rawScript = unpackScript(mScript);
         buf[length++] = '-';
         // Emit the four tag bytes from most to least significant.
         for (int shift = 24; shift >= 0; shift -= 8) {
             buf[length++] = (rawScript >> shift) & 0xFFu;
         }
     }

     if (mRegion != NO_REGION) {
         buf[length++] = '-';
         length += unpackRegion(mRegion, buf + length);
     }

     if (mVariant != Variant::NO_VARIANT) {
         // Both known variants share the "-19" prefix.
         buf[length++] = '-';
         buf[length++] = '1';
         buf[length++] = '9';
         switch (mVariant) {
             case Variant::GERMAN_1901_ORTHOGRAPHY:
                 buf[length++] = '0';
                 buf[length++] = '1';
                 break;
             case Variant::GERMAN_1996_ORTHOGRAPHY:
                 buf[length++] = '9';
                 buf[length++] = '6';
                 break;
             default:
                 MINIKIN_ASSERT(false, "Must not reached.");
         }
     }
     return length;
 }

 // Returns a copy of this locale restricted to the parts selected by `bits`. Parts that are
 // not selected keep the value of a default-constructed Locale, except the language, which is
 // set to the packed form of "und" when LANGUAGE is not selected. Selecting SCRIPT copies both
 // the script and its sub-script bits.
 Locale Locale::getPartialLocale(SubtagBits bits) const {
     const auto selected = [bits](SubtagBits part) {
         return (bits & part) != SubtagBits::EMPTY;
     };

     Locale partial;
     if (selected(SubtagBits::LANGUAGE)) {
         partial.mLanguage = mLanguage;
     } else {
         partial.mLanguage = packLanguage("und");
     }
     if (selected(SubtagBits::SCRIPT)) {
         partial.mScript = mScript;
         partial.mSubScriptBits = mSubScriptBits;
     }
     if (selected(SubtagBits::REGION)) {
         partial.mRegion = mRegion;
     }
     if (selected(SubtagBits::VARIANT)) {
         partial.mVariant = mVariant;
     }
     if (selected(SubtagBits::EMOJI)) {
         partial.mEmojiStyle = mEmojiStyle;
     }
     return partial;
 }

 // Returns true if both locales carry the same packed script value.
 bool Locale::isEqualScript(const Locale& other) const {
     return mScript == other.mScript;
 }

 // static
 // Returns true if every flag in requestedBits is also set in providedBits. An empty request
 // (requestedBits == 0) is never considered supported.
 bool Locale::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
     if (requestedBits == 0) {
         return false;
     }
     const uint8_t covered = providedBits & requestedBits;
     return covered == requestedBits;
 }

 // Returns true if this locale supports the given script, passed as a 32-bit four-character
 // tag: either the packed script equals this locale's script, or this locale's sub-script bits
 // cover all sub-script bits of the requested script.
 bool Locale::supportsScript(uint32_t script) const {
     // Compile-time check that unpacking a packed script yields the HarfBuzz tag encoding.
     static_assert(unpackScript(packScript('J', 'p', 'a', 'n')) == HB_TAG('J', 'p', 'a', 'n'),
                   "The Minikin script and HarfBuzz hb_script_t have different encodings.");
     const uint32_t requested = packScript(script);
     if (requested == mScript) {
         return true;
     }
     const uint8_t requestedBits = scriptToSubScriptBits(requested);
     return supportsScript(mSubScriptBits, requestedBits);
 }

 int Locale::calcScoreFor(const LocaleList& supported) const {
     bool languageScriptMatch = false;
     bool subtagMatch = false;
     bool scriptMatch = false;

     for (size_t i = 0; i < supported.size(); ++i) {
         if (mEmojiStyle != EmojiStyle::EMPTY && mEmojiStyle == supported[i].mEmojiStyle) {
             subtagMatch = true;
             if (mLanguage == supported[i].mLanguage) {
                 return 4;
             }
         }
         if (isEqualScript(supported[i]) ||
             supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
             scriptMatch = true;
             if (mLanguage == supported[i].mLanguage) {
                 languageScriptMatch = true;
             }
         }
     }

     if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
         scriptMatch = true;
         if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLocale()) {
             return 3;
         }
     }

     if (languageScriptMatch) {
         return 3;
     } else if (subtagMatch) {
         return 2;
     } else if (scriptMatch) {
         return 1;
     }
     return 0;
 }

 // Converts a locale to a HarfBuzz language via its string form. Locales for which
 // isSupported() is false map to HB_LANGUAGE_INVALID.
 static hb_language_t buildHbLanguage(const Locale& locale) {
     if (!locale.isSupported()) {
         return HB_LANGUAGE_INVALID;
     }
     const std::string tag = locale.getString();
     // -1 lets HarfBuzz take the length from the NUL terminator.
     return hb_language_from_string(tag.c_str(), -1);
 }

 // Takes ownership of `locales` and precomputes per-list data in one pass:
 //   - mUnionOfSubScriptBits: OR of the sub-script bits of all locales,
 //   - mIsAllTheSameLocale: true while every locale has the same language as the first one
 //     (also true for an empty list),
 //   - mHbLangs: the HarfBuzz language for each locale, in the same order,
 //   - mEmojiStyle: the first non-EMPTY emoji style found in the list, if any.
 LocaleList::LocaleList(std::vector<Locale>&& locales) : mLocales(std::move(locales)) {
     mEmojiStyle = EmojiStyle::EMPTY;
     mUnionOfSubScriptBits = 0u;
     mIsAllTheSameLocale = true;
     mHbLangs.reserve(mLocales.size());

     const auto firstLanguage = mLocales.empty() ? NO_LANGUAGE : mLocales[0].mLanguage;
     for (const Locale& current : mLocales) {
         mUnionOfSubScriptBits |= current.mSubScriptBits;
         if (current.mLanguage != firstLanguage) {
             mIsAllTheSameLocale = false;
         }
         mHbLangs.push_back(buildHbLanguage(current));
         if (mEmojiStyle == EmojiStyle::EMPTY) {
             mEmojiStyle = current.getEmojiStyle();
         }
     }
 }

 }  // namespace minikin
	/*
	* Copyright (C) 2015 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "Locale.h"

	#include <algorithm>

	#include <hb.h>

	#include "minikin/LocaleList.h"

	#include "LocaleListCache.h"
	#include "MinikinInternal.h"
	#include "StringPiece.h"

	namespace minikin {

	constexpr uint32_t FIVE_BITS = 0x1f;

	uint32_t registerLocaleList(const std::string& locales) {
	return LocaleListCache::getId(locales);
	}

	std::string getLocaleString(uint32_t localeId) {
	const LocaleList& localeList = LocaleListCache::getById(localeId);
	std::string out;
	for (size_t i = 0; i < localeList.size(); ++i) {
	if (i != 0) {
	out += ",";
	}
	out += localeList[i].getString();
	}
	return out;
	}

	// Check if a language code supports extension such as emoji and line break etc. according to its
	// subtag
	static bool isSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) {
	if (bufLen < subtagLen) {
	return false;
	}
	if (strncmp(buf, subtag, subtagLen) != 0) {
	return false; // no match between two strings
	}
	return (bufLen == subtagLen \|\| buf[subtagLen] == '\0' \|\| buf[subtagLen] == '-' \|\|
	buf[subtagLen] == '_');
	}

	// Pack the three letter code into 15 bits and stored to 16 bit integer. The highest bit is 0.
	// For the region code, the letters must be all digits in three letter case, so the number of
	// possible values are 10. For the language code, the letters must be all small alphabets, so the
	// number of possible values are 26. Thus, 5 bits are sufficient for each case and we can pack the
	// three letter language code or region code to 15 bits.
	//
	// In case of two letter code, use fullbit(0x1f) for the first letter instead.
	static uint16_t packLanguageOrRegion(const StringPiece& in, uint8_t twoLetterBase,
	uint8_t threeLetterBase) {
	if (in.length() == 2) {
	return 0x7c00u \| // 0x1fu << 10
	(uint16_t)(in[0] - twoLetterBase) << 5 \| (uint16_t)(in[1] - twoLetterBase);
	} else {
	return ((uint16_t)(in[0] - threeLetterBase) << 10) \|
	(uint16_t)(in[1] - threeLetterBase) << 5 \| (uint16_t)(in[2] - threeLetterBase);
	}
	}

	static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase,
	uint8_t threeLetterBase) {
	uint8_t first = (in >> 10) & FIVE_BITS;
	uint8_t second = (in >> 5) & FIVE_BITS;
	uint8_t third = in & FIVE_BITS;

	if (first == 0x1f) {
	out[0] = second + twoLetterBase;
	out[1] = third + twoLetterBase;
	return 2;
	} else {
	out[0] = first + threeLetterBase;
	out[1] = second + threeLetterBase;
	out[2] = third + threeLetterBase;
	return 3;
	}
	}

	static uint16_t packLanguage(const StringPiece& in) {
	return packLanguageOrRegion(in, 'a', 'a');
	}

	static size_t unpackLanguage(uint16_t in, char* out) {
	return unpackLanguageOrRegion(in, out, 'a', 'a');
	}

	constexpr uint32_t packScript(char c1, char c2, char c3, char c4) {
	constexpr char FIRST_LETTER_BASE = 'A';
	constexpr char REST_LETTER_BASE = 'a';
	return ((uint32_t)(c1 - FIRST_LETTER_BASE) << 15) \| (uint32_t)(c2 - REST_LETTER_BASE) << 10 \|
	((uint32_t)(c3 - REST_LETTER_BASE) << 5) \| (uint32_t)(c4 - REST_LETTER_BASE);
	}

	constexpr uint32_t packScript(uint32_t script) {
	return packScript(script >> 24, (script >> 16) & 0xff, (script >> 8) & 0xff, script & 0xff);
	}

	constexpr uint32_t unpackScript(uint32_t packedScript) {
	constexpr char FIRST_LETTER_BASE = 'A';
	constexpr char REST_LETTER_BASE = 'a';
	const uint32_t first = (packedScript >> 15) + FIRST_LETTER_BASE;
	const uint32_t second = ((packedScript >> 10) & FIVE_BITS) + REST_LETTER_BASE;
	const uint32_t third = ((packedScript >> 5) & FIVE_BITS) + REST_LETTER_BASE;
	const uint32_t fourth = (packedScript & FIVE_BITS) + REST_LETTER_BASE;

	return first << 24 \| second << 16 \| third << 8 \| fourth;
	}

	static uint16_t packRegion(const StringPiece& in) {
	return packLanguageOrRegion(in, 'A', '0');
	}

	static size_t unpackRegion(uint16_t in, char* out) {
	return unpackLanguageOrRegion(in, out, 'A', '0');
	}

	static inline bool isLowercase(char c) {
	return 'a' <= c && c <= 'z';
	}

	static inline bool isUppercase(char c) {
	return 'A' <= c && c <= 'Z';
	}

	static inline bool isDigit(char c) {
	return '0' <= c && c <= '9';
	}

	// Returns true if the buffer is valid for language code.
	static inline bool isValidLanguageCode(const StringPiece& buffer) {
	if (buffer.length() != 2 && buffer.length() != 3) return false;
	if (!isLowercase(buffer[0])) return false;
	if (!isLowercase(buffer[1])) return false;
	if (buffer.length() == 3 && !isLowercase(buffer[2])) return false;
	return true;
	}

	// Returns true if buffer is valid for script code. The length of buffer must be 4.
	static inline bool isValidScriptCode(const StringPiece& buffer) {
	return buffer.size() == 4 && isUppercase(buffer[0]) && isLowercase(buffer[1]) &&
	isLowercase(buffer[2]) && isLowercase(buffer[3]);
	}

	// Returns true if the buffer is valid for region code.
	static inline bool isValidRegionCode(const StringPiece& buffer) {
	return (buffer.size() == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) \|\|
	(buffer.size() == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2]));
	}

	// Parse BCP 47 language identifier into internal structure
	Locale::Locale(const StringPiece& input) : Locale() {
	SplitIterator it(input, '-');

	StringPiece language = it.next();
	if (isValidLanguageCode(language)) {
	mLanguage = packLanguage(language);
	} else {
	// We don't understand anything other than two-letter or three-letter
	// language codes, so we skip parsing the rest of the string.
	return;
	}

	if (!it.hasNext()) {
	return; // Language code only.
	}
	StringPiece token = it.next();

	if (isValidScriptCode(token)) {
	mScript = packScript(token[0], token[1], token[2], token[3]);
	mSubScriptBits = scriptToSubScriptBits(mScript);

	if (!it.hasNext()) {
	goto finalize; // No variant, emoji subtag and region code.
	}
	token = it.next();
	}

	if (isValidRegionCode(token)) {
	mRegion = packRegion(token);

	if (!it.hasNext()) {
	goto finalize; // No variant or emoji subtag.
	}
	token = it.next();
	}

	if (language == "de") { // We are only interested in German variants.
	if (token == "1901") {
	mVariant = Variant::GERMAN_1901_ORTHOGRAPHY;
	} else if (token == "1996") {
	mVariant = Variant::GERMAN_1996_ORTHOGRAPHY;
	}

	if (mVariant != Variant::NO_VARIANT) {
	if (!it.hasNext()) {
	goto finalize; // No emoji subtag.
	}

	token = it.next();
	}
	}

	resolveUnicodeExtension(input.data(), input.length());

	finalize:
	if (mEmojiStyle == EmojiStyle::EMPTY) {
	mEmojiStyle = scriptToEmojiStyle(mScript);
	}
	}

	void Locale::resolveUnicodeExtension(const char* buf, size_t length) {
	static const char kPrefix[] = "-u-";
	const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
	if (pos != buf + length) {
	pos += strlen(kPrefix);
	const size_t remainingLength = length - (pos - buf);
	mEmojiStyle = resolveEmojiStyle(pos, remainingLength);
	}
	}

	// static
	// Lookup emoji subtag and determine the emoji style.
	EmojiStyle Locale::resolveEmojiStyle(const char* buf, size_t length) {
	// 7 is the length of "-u-em-text", which is the shortest emoji subtag,
	// unnecessary comparison can be avoided if total length is smaller than 10.
	const size_t kMinSubtagLength = 7;
	if (length >= kMinSubtagLength) {
	static const char kPrefix[] = "em-";
	const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
	if (pos != buf + length) { // found
	pos += strlen(kPrefix);
	const size_t remainingLength = length - (pos - buf);
	if (isSubtag(pos, remainingLength, "emoji", 5)) {
	return EmojiStyle::EMOJI;
	} else if (isSubtag(pos, remainingLength, "text", 4)) {
	return EmojiStyle::TEXT;
	} else if (isSubtag(pos, remainingLength, "default", 7)) {
	return EmojiStyle::DEFAULT;
	}
	}
	}
	return EmojiStyle::EMPTY;
	}

	EmojiStyle Locale::scriptToEmojiStyle(uint32_t script) {
	// If no emoji subtag was provided, resolve the emoji style from script code.
	if (script == packScript('Z', 's', 'y', 'e')) {
	return EmojiStyle::EMOJI;
	} else if (script == packScript('Z', 's', 'y', 'm')) {
	return EmojiStyle::TEXT;
	}
	return EmojiStyle::EMPTY;
	}

	// static
	uint8_t Locale::scriptToSubScriptBits(uint32_t script) {
	uint8_t subScriptBits = 0u;
	switch (script) {
	case packScript('B', 'o', 'p', 'o'):
	subScriptBits = kBopomofoFlag;
	break;
	case packScript('H', 'a', 'n', 'g'):
	subScriptBits = kHangulFlag;
	break;
	case packScript('H', 'a', 'n', 'b'):
	// Bopomofo is almost exclusively used in Taiwan.
	subScriptBits = kHanFlag \| kBopomofoFlag;
	break;
	case packScript('H', 'a', 'n', 'i'):
	subScriptBits = kHanFlag;
	break;
	case packScript('H', 'a', 'n', 's'):
	subScriptBits = kHanFlag \| kSimplifiedChineseFlag;
	break;
	case packScript('H', 'a', 'n', 't'):
	subScriptBits = kHanFlag \| kTraditionalChineseFlag;
	break;
	case packScript('H', 'i', 'r', 'a'):
	subScriptBits = kHiraganaFlag;
	break;
	case packScript('H', 'r', 'k', 't'):
	subScriptBits = kKatakanaFlag \| kHiraganaFlag;
	break;
	case packScript('J', 'p', 'a', 'n'):
	subScriptBits = kHanFlag \| kKatakanaFlag \| kHiraganaFlag;
	break;
	case packScript('K', 'a', 'n', 'a'):
	subScriptBits = kKatakanaFlag;
	break;
	case packScript('K', 'o', 'r', 'e'):
	subScriptBits = kHanFlag \| kHangulFlag;
	break;
	}
	return subScriptBits;
	}

	std::string Locale::getString() const {
	char buf[32];
	int i = buildLocaleString(buf);
	return std::string(buf, i);
	}

	std::string Locale::getStringWithLineBreakOption(LineBreakStyle lbStyle,
	LineBreakWordStyle lbWordStyle) const {
	char buf[48];
	int i = buildLocaleString(buf);

	// Add line break unicode extension.
	if (lbStyle != LineBreakStyle::None \|\| lbWordStyle != LineBreakWordStyle::None) {
	buf[i++] = '-';
	buf[i++] = 'u';
	}

	if (lbStyle != LineBreakStyle::None) {
	buf[i++] = '-';
	buf[i++] = 'l';
	buf[i++] = 'b';
	buf[i++] = '-';
	switch (lbStyle) {
	case LineBreakStyle::Loose:
	buf[i++] = 'l';
	buf[i++] = 'o';
	buf[i++] = 'o';
	buf[i++] = 's';
	buf[i++] = 'e';
	break;
	case LineBreakStyle::Normal:
	buf[i++] = 'n';
	buf[i++] = 'o';
	buf[i++] = 'r';
	buf[i++] = 'm';
	buf[i++] = 'a';
	buf[i++] = 'l';
	break;
	case LineBreakStyle::Strict:
	buf[i++] = 's';
	buf[i++] = 't';
	buf[i++] = 'r';
	buf[i++] = 'i';
	buf[i++] = 'c';
	buf[i++] = 't';
	break;
	default:
	MINIKIN_ASSERT(false, "Must not reached.");
	}
	}

	if (lbWordStyle != LineBreakWordStyle::None) {
	buf[i++] = '-';
	buf[i++] = 'l';
	buf[i++] = 'w';
	buf[i++] = '-';
	switch (lbWordStyle) {
	case LineBreakWordStyle::Phrase:
	buf[i++] = 'p';
	buf[i++] = 'h';
	buf[i++] = 'r';
	buf[i++] = 'a';
	buf[i++] = 's';
	buf[i++] = 'e';
	break;
	default:
	MINIKIN_ASSERT(false, "Must not reached.");
	}
	}
	return std::string(buf, i);
	}

	int Locale::buildLocaleString(char* buf) const {
	size_t i;
	if (mLanguage == NO_LANGUAGE) {
	buf[0] = 'u';
	buf[1] = 'n';
	buf[2] = 'd';
	i = 3;
	} else {
	i = unpackLanguage(mLanguage, buf);
	}
	if (mScript != NO_SCRIPT) {
	uint32_t rawScript = unpackScript(mScript);
	buf[i++] = '-';
	buf[i++] = (rawScript >> 24) & 0xFFu;
	buf[i++] = (rawScript >> 16) & 0xFFu;
	buf[i++] = (rawScript >> 8) & 0xFFu;
	buf[i++] = rawScript & 0xFFu;
	}
	if (mRegion != NO_REGION) {
	buf[i++] = '-';
	i += unpackRegion(mRegion, buf + i);
	}
	if (mVariant != Variant::NO_VARIANT) {
	buf[i++] = '-';
	buf[i++] = '1';
	buf[i++] = '9';
	switch (mVariant) {
	case Variant::GERMAN_1901_ORTHOGRAPHY:
	buf[i++] = '0';
	buf[i++] = '1';
	break;
	case Variant::GERMAN_1996_ORTHOGRAPHY:
	buf[i++] = '9';
	buf[i++] = '6';
	break;
	default:
	MINIKIN_ASSERT(false, "Must not reached.");
	}
	}
	return i;
	}

	Locale Locale::getPartialLocale(SubtagBits bits) const {
	Locale subLocale;
	if ((bits & SubtagBits::LANGUAGE) != SubtagBits::EMPTY) {
	subLocale.mLanguage = mLanguage;
	} else {
	subLocale.mLanguage = packLanguage("und");
	}
	if ((bits & SubtagBits::SCRIPT) != SubtagBits::EMPTY) {
	subLocale.mScript = mScript;
	subLocale.mSubScriptBits = mSubScriptBits;
	}
	if ((bits & SubtagBits::REGION) != SubtagBits::EMPTY) {
	subLocale.mRegion = mRegion;
	}
	if ((bits & SubtagBits::VARIANT) != SubtagBits::EMPTY) {
	subLocale.mVariant = mVariant;
	}
	if ((bits & SubtagBits::EMOJI) != SubtagBits::EMPTY) {
	subLocale.mEmojiStyle = mEmojiStyle;
	}
	return subLocale;
	}

	bool Locale::isEqualScript(const Locale& other) const {
	return other.mScript == mScript;
	}

	// static
	bool Locale::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
	return requestedBits != 0 && (providedBits & requestedBits) == requestedBits;
	}

	bool Locale::supportsScript(uint32_t script) const {
	static_assert(unpackScript(packScript('J', 'p', 'a', 'n')) == HB_TAG('J', 'p', 'a', 'n'),
	"The Minikin script and HarfBuzz hb_script_t have different encodings.");
	uint32_t packedScript = packScript(script);
	if (packedScript == mScript) return true;
	return supportsScript(mSubScriptBits, scriptToSubScriptBits(packedScript));
	}

	int Locale::calcScoreFor(const LocaleList& supported) const {
	bool languageScriptMatch = false;
	bool subtagMatch = false;
	bool scriptMatch = false;

	for (size_t i = 0; i < supported.size(); ++i) {
	if (mEmojiStyle != EmojiStyle::EMPTY && mEmojiStyle == supported[i].mEmojiStyle) {
	subtagMatch = true;
	if (mLanguage == supported[i].mLanguage) {
	return 4;
	}
	}
	if (isEqualScript(supported[i]) \|\|
	supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
	scriptMatch = true;
	if (mLanguage == supported[i].mLanguage) {
	languageScriptMatch = true;
	}
	}
	}

	if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
	scriptMatch = true;
	if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLocale()) {
	return 3;
	}
	}

	if (languageScriptMatch) {
	return 3;
	} else if (subtagMatch) {
	return 2;
	} else if (scriptMatch) {
	return 1;
	}
	return 0;
	}

	static hb_language_t buildHbLanguage(const Locale& locale) {
	return locale.isSupported() ? hb_language_from_string(locale.getString().c_str(), -1)
	: HB_LANGUAGE_INVALID;
	}

	LocaleList::LocaleList(std::vector<Locale>&& locales) : mLocales(std::move(locales)) {
	mIsAllTheSameLocale = true;
	mUnionOfSubScriptBits = 0u;
	mHbLangs.reserve(mLocales.size());
	mEmojiStyle = EmojiStyle::EMPTY;
	const auto firstLanguage = mLocales.empty() ? NO_LANGUAGE : mLocales[0].mLanguage;
	for (const Locale& locale : mLocales) {
	mUnionOfSubScriptBits \|= locale.mSubScriptBits;
	if (mIsAllTheSameLocale && firstLanguage != locale.mLanguage) {
	mIsAllTheSameLocale = false;
	}
	mHbLangs.push_back(buildHbLanguage(locale));
	if (mEmojiStyle == EmojiStyle::EMPTY) {
	mEmojiStyle = locale.getEmojiStyle();
	}
	}
	}

	} // namespace minikin