Adding address formatting functionality - on a single line, on multiple lines, and combining multiple street address lines into one, in a language-dependent fashion.
diff --git a/cpp/include/libaddressinput/address_formatter.h b/cpp/include/libaddressinput/address_formatter.h
new file mode 100644
index 0000000..56d2c45
--- /dev/null
+++ b/cpp/include/libaddressinput/address_formatter.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Utility functions for formatting the addresses represented as AddressData.
+
+#ifndef I18N_ADDRESSINPUT_ADDRESS_FORMATTER_H
+#define I18N_ADDRESSINPUT_ADDRESS_FORMATTER_H
+
+#include <libaddressinput/address_formatter.h>
+
+#include <string>
+#include <vector>
+
+namespace i18n {
+namespace addressinput {
+
+struct AddressData;
+
+// Formats |address_data| onto multiple lines, using the national format (that
+// is, without the country). |lines| must not be NULL; its previous contents are
+// discarded and replaced with the formatted lines.
+void GetFormattedNationalAddress(
+ const AddressData& address_data, std::vector<std::string>* lines);
+
+// Formats |address_data| as a single line, using the national format (that is,
+// without the country). The lines of the multi-line form are joined with a
+// separator chosen from |address_data.language_code| and the result is written
+// to |line|.
+void GetFormattedNationalAddressLine(
+ const AddressData& address_data, std::string* line);
+
+// Formats the street-level part of an address as a single line. For example,
+// two lines of "Apt 1", "10 Red St." will be concatenated in a
+// language-appropriate way, to give something like "Apt 1, 10 Red St.". The
+// separator is chosen from |address_data.language_code| and the result is
+// written to |line|.
+void GetStreetAddressLinesAsSingleLine(
+ const AddressData& address_data, std::string* line);
+
+} // namespace addressinput
+} // namespace i18n
+
+#endif // I18N_ADDRESSINPUT_ADDRESS_FORMATTER_H
diff --git a/cpp/libaddressinput.gypi b/cpp/libaddressinput.gypi
index cfb63c4..1c2f2f2 100644
--- a/cpp/libaddressinput.gypi
+++ b/cpp/libaddressinput.gypi
@@ -17,6 +17,7 @@
'src/address_data.cc',
'src/address_field.cc',
'src/address_field_util.cc',
+ 'src/address_formatter.cc',
'src/address_problem.cc',
'src/address_ui.cc',
'src/address_validator.cc',
@@ -46,6 +47,7 @@
'test/address_data_test.cc',
'test/address_field_test.cc',
'test/address_field_util_test.cc',
+ 'test/address_formatter_test.cc',
'test/address_problem_test.cc',
'test/address_ui_test.cc',
'test/address_validator_test.cc',
diff --git a/cpp/src/address_formatter.cc b/cpp/src/address_formatter.cc
new file mode 100644
index 0000000..872d381
--- /dev/null
+++ b/cpp/src/address_formatter.cc
@@ -0,0 +1,191 @@
+// Copyright (C) 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <libaddressinput/address_formatter.h>
+
+#include <libaddressinput/address_data.h>
+#include <libaddressinput/address_field.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <string>
+#include <vector>
+
+#include "format_element.h"
+#include "language.h"
+#include "region_data_constants.h"
+#include "rule.h"
+
+namespace i18n {
+namespace addressinput {
+
+namespace {
+
+// Separators placed between address lines when they are joined into one line.
+const char kCommaSeparator[] = ", ";
+const char kSpaceSeparator[] = " ";
+const char kArabicCommaSeparator[] = "\xD8\x8C" " ";  /* "، " */
+
+// The tables below are read-only lookup data, so both the strings and the
+// pointers to them are const.
+
+// Base languages whose lines are joined with a single space.
+const char* const kLanguagesThatUseSpace[] = {
+  "th",
+  "ko"
+};
+
+// Base languages whose lines are joined with nothing in between.
+const char* const kLanguagesThatHaveNoSeparator[] = {
+  "ja",
+  "zh"  // All Chinese variants.
+};
+
+// This data is based on CLDR, for languages that are in official use in some
+// country, where Arabic is the most likely script tag.
+// TODO: Consider supporting variants such as tr-Arab by detecting the script
+// code.
+const char* const kLanguagesThatUseAnArabicComma[] = {
+  "ar",
+  "az",
+  "fa",
+  "kk",
+  "ku",
+  "ky",
+  "ps",
+  "tg",
+  "tk",
+  "ur",
+  "uz"
+};
+
+// Case insensitive matcher for language tags, usable as a predicate with the
+// standard algorithms. Only ASCII letters are case-folded; language tags are
+// expected to be plain ASCII.
+struct LanguageMatcher {
+  explicit LanguageMatcher(const std::string& tag) : tag(tag) {}
+
+  // The tag every candidate is compared against.
+  std::string tag;
+
+  // Returns true if |s| equals |tag| when ASCII case is ignored. The comparison
+  // is done by hand rather than with strcasecmp(), which is POSIX-only and
+  // depends on the current locale.
+  bool operator()(const std::string& s) const {
+    if (s.size() != tag.size()) {
+      return false;
+    }
+    for (std::string::size_type i = 0; i < tag.size(); ++i) {
+      if (ToLowerAscii(tag[i]) != ToLowerAscii(s[i])) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+ private:
+  // Maps 'A'-'Z' onto 'a'-'z' and leaves every other character untouched.
+  static char ToLowerAscii(char c) {
+    return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
+  }
+};
+
+// Reports whether |language| matches, ignoring case, one of the entries of the
+// array |languages|.
+template <size_t N>
+bool IsLanguageListed(const char* const (&languages)[N],
+                      const std::string& language) {
+  const char* const* const end = languages + N;
+  return std::find_if(languages, end, LanguageMatcher(language)) != end;
+}
+
+// Returns the text to put between address lines when they are joined into a
+// single line for an address written in |language_tag|.
+std::string GetLineSeparatorForLanguage(const std::string& language_tag) {
+  const Language parsed_tag(language_tag);
+
+  // An explicit Latin script tag takes precedence over the base language.
+  if (parsed_tag.has_latin_script) {
+    return kCommaSeparator;
+  }
+
+  // Otherwise guess something appropriate from the base language alone.
+  const std::string& base = parsed_tag.base;
+  if (IsLanguageListed(kLanguagesThatUseSpace, base)) {
+    return kSpaceSeparator;
+  }
+  if (IsLanguageListed(kLanguagesThatHaveNoSeparator, base)) {
+    return "";
+  }
+  if (IsLanguageListed(kLanguagesThatUseAnArabicComma, base)) {
+    return kArabicCommaSeparator;
+  }
+
+  // Either the language is a latin-script language, or no language was
+  // specified. In the latter case ", " is still returned, as the most common
+  // separator in use. In countries that don't use it, e.g. Thailand, addresses
+  // are often written in latin script where it would still be appropriate, so
+  // it is a reasonable default in the absence of information.
+  return kCommaSeparator;
+}
+
+// Joins |lines| into |line|, placing the separator appropriate for
+// |language_tag| between consecutive entries. Any previous contents of |line|
+// are replaced; an empty |lines| yields an empty |line|.
+void CombineLinesForLanguage(
+    const std::vector<std::string>& lines, const std::string& language_tag,
+    std::string* line) {
+  line->clear();
+  if (lines.empty()) {
+    // Nothing to join. Returning here also avoids forming begin() + 1 on an
+    // empty vector, which is undefined behaviour.
+    return;
+  }
+
+  const std::string separator = GetLineSeparatorForLanguage(language_tag);
+  std::vector<std::string>::const_iterator it = lines.begin();
+  line->assign(*it);
+  for (++it; it != lines.end(); ++it) {
+    line->append(separator);
+    line->append(*it);
+  }
+}
+
+} // namespace
+
+// Fills |lines| with the national-format lines of |address_data|. |lines| must
+// not be NULL and is emptied first.
+void GetFormattedNationalAddress(
+    const AddressData& address_data, std::vector<std::string>* lines) {
+  assert(lines != NULL);
+  lines->clear();
+
+  // Start from the default rule and overlay the data for the address's region.
+  // TODO: Eventually, we should get the best rule for this country and
+  // language, rather than just for the country.
+  Rule rule;
+  rule.CopyFrom(Rule::GetDefault());
+  rule.ParseSerializedRule(
+      RegionDataConstants::GetRegionData(address_data.region_code));
+
+  // The latinized format is used only when one is available and the
+  // |language_code| of this address is explicitly tagged as being Latin.
+  const Language language(address_data.language_code);
+  const bool use_latin_format =
+      language.has_latin_script && !rule.GetLatinFormat().empty();
+  const std::vector<FormatElement>& format =
+      use_latin_format ? rule.GetLatinFormat() : rule.GetFormat();
+
+  std::string pending;  // The output line currently being assembled.
+  for (std::vector<FormatElement>::const_iterator element = format.begin();
+       element != format.end(); ++element) {
+    const bool is_newline = element->IsNewline();
+    // The field "street address" stands for all the street address lines of an
+    // address, so it can expand to several output lines.
+    const bool is_street_address = !is_newline && element->IsField() &&
+                                   element->GetField() == STREET_ADDRESS;
+
+    if (is_newline || is_street_address) {
+      // Both a newline and the street address end the line in progress.
+      if (!pending.empty()) {
+        lines->push_back(pending);
+        pending.clear();
+      }
+      if (is_street_address) {
+        lines->insert(lines->end(), address_data.address_line.begin(),
+                      address_data.address_line.end());
+      }
+    } else if (element->IsField()) {
+      pending.append(address_data.GetFieldValue(element->GetField()));
+    } else {
+      pending.append(element->GetLiteral());
+    }
+  }
+
+  // Emit whatever is left after the last format element.
+  if (!pending.empty()) {
+    lines->push_back(pending);
+  }
+}
+
+// Builds the multi-line national format of |address_data| and then joins those
+// lines into |line| using the separator for the address's language.
+void GetFormattedNationalAddressLine(
+    const AddressData& address_data, std::string* line) {
+  std::vector<std::string> formatted_lines;
+  GetFormattedNationalAddress(address_data, &formatted_lines);
+
+  const std::string& language_tag = address_data.language_code;
+  CombineLinesForLanguage(formatted_lines, language_tag, line);
+}
+
+// Joins the street address lines of |address_data| into |line| using the
+// separator for the address's language.
+void GetStreetAddressLinesAsSingleLine(
+    const AddressData& address_data, std::string* line) {
+  const std::vector<std::string>& street_lines = address_data.address_line;
+  const std::string& language_tag = address_data.language_code;
+  CombineLinesForLanguage(street_lines, language_tag, line);
+}
+
+} // namespace addressinput
+} // namespace i18n
diff --git a/cpp/src/format_element.h b/cpp/src/format_element.h
index 0fc3d56..e37a188 100644
--- a/cpp/src/format_element.h
+++ b/cpp/src/format_element.h
@@ -44,7 +44,7 @@
// Returns true if this element represents a new line.
bool IsNewline() const { return literal_ == "\n"; }
- const AddressField& GetField() const { return field_; }
+ AddressField GetField() const { return field_; }
const std::string& GetLiteral() const { return literal_; }
bool operator==(const FormatElement& other) const;
diff --git a/cpp/test/address_formatter_test.cc b/cpp/test/address_formatter_test.cc
new file mode 100644
index 0000000..ae5b847
--- /dev/null
+++ b/cpp/test/address_formatter_test.cc
@@ -0,0 +1,201 @@
+// Copyright (C) 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <libaddressinput/address_formatter.h>
+
+#include <libaddressinput/address_data.h>
+
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+namespace {
+
+using i18n::addressinput::AddressData;
+using i18n::addressinput::GetStreetAddressLinesAsSingleLine;
+using i18n::addressinput::GetFormattedNationalAddress;
+using i18n::addressinput::GetFormattedNationalAddressLine;
+
+// A single street line must come back unchanged, whatever the language.
+TEST(AddressFormatterTest, GetStreetAddressLinesAsSingleLine_1Line) {
+  AddressData address;
+  address.region_code = "US";  // Not used.
+  address.address_line.push_back("Line 1");
+
+  // First with no language_code set at all.
+  std::string result;
+  GetStreetAddressLinesAsSingleLine(address, &result);
+  EXPECT_EQ("Line 1", result);
+
+  // Setting the language_code, with one line, shouldn't affect anything.
+  const char* const kLanguageCodes[] = {"en", "zh-Hans"};
+  const std::size_t kNumLanguageCodes =
+      sizeof(kLanguageCodes) / sizeof(kLanguageCodes[0]);
+  for (std::size_t i = 0; i < kNumLanguageCodes; ++i) {
+    address.language_code = kLanguageCodes[i];
+    GetStreetAddressLinesAsSingleLine(address, &result);
+    EXPECT_EQ("Line 1", result) << "language_code: " << kLanguageCodes[i];
+  }
+}
+
+// Two street lines are joined with a separator that depends on the language.
+TEST(AddressFormatterTest, GetStreetAddressLinesAsSingleLine_2Lines) {
+  AddressData address;
+  address.region_code = "US";  // Not used.
+  address.address_line.push_back("Line 1");
+  address.address_line.push_back("Line 2");
+
+  // Default separator if no language_code specified: ", "
+  std::string result;
+  GetStreetAddressLinesAsSingleLine(address, &result);
+  EXPECT_EQ("Line 1, Line 2", result);
+
+  // Each language_code is paired with the joined line it should produce.
+  static const struct {
+    const char* language_code;
+    const char* expected;
+  } kCases[] = {
+    {"en", "Line 1, Line 2"},
+    {"zh-Hans", "Line 1Line 2"},  // Chinese has no separator.
+    {"ko", "Line 1 Line 2"},
+    {"ar", "Line 1\xD8\x8C" " Line 2"}  // Arabic comma.
+  };
+  for (std::size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); ++i) {
+    address.language_code = kCases[i].language_code;
+    GetStreetAddressLinesAsSingleLine(address, &result);
+    EXPECT_EQ(std::string(kCases[i].expected), result)
+        << "language_code: " << kCases[i].language_code;
+  }
+}
+
+// Every street line is joined in, with the separator repeated between each
+// pair of lines.
+TEST(AddressFormatterTest, GetStreetAddressLinesAsSingleLine_5Lines) {
+  AddressData address;
+  address.region_code = "US";  // Not used.
+  address.address_line.push_back("Line 1");
+  address.address_line.push_back("Line 2");
+  address.address_line.push_back("Line 3");
+  address.address_line.push_back("Line 4");
+  address.address_line.push_back("Line 5");
+  address.language_code = "fr";
+
+  std::string result;
+  GetStreetAddressLinesAsSingleLine(address, &result);
+  // Expected value first, like every other assertion in this file, so that a
+  // failure message labels the two values correctly.
+  EXPECT_EQ("Line 1, Line 2, Line 3, Line 4, Line 5", result);
+}
+
+// A region with a single format gives the same lines for any language_code.
+TEST(AddressFormatterTest, GetFormattedNationalAddressLocalLanguage) {
+  AddressData address;
+  address.region_code = "NZ";
+  address.locality = "Leeston";
+  address.postal_code = "8704";
+  address.address_line.push_back("Rotopapa");
+  address.address_line.push_back("Irwell 3RD");
+
+  const char* const kExpectedLines[] = {
+    "Rotopapa",
+    "Irwell 3RD",
+    "Leeston 8704"
+  };
+  const std::vector<std::string> expected(
+      kExpectedLines,
+      kExpectedLines + sizeof(kExpectedLines) / sizeof(kExpectedLines[0]));
+
+  std::vector<std::string> lines;
+  GetFormattedNationalAddress(address, &lines);
+  EXPECT_EQ(expected, lines);
+
+  // Should be the same result no matter what the language_code is. We choose an
+  // unlikely language_code to illustrate this.
+  address.language_code = "en-Latn-CN";
+
+  lines.clear();
+  GetFormattedNationalAddress(address, &lines);
+  EXPECT_EQ(expected, lines);
+
+  std::string one_line;
+  GetFormattedNationalAddressLine(address, &one_line);
+  EXPECT_EQ("Rotopapa, Irwell 3RD, Leeston 8704", one_line);
+}
+
+// Formats a Taiwanese address with both the local format and the latinized
+// format; the two differ in line order and in the one-line separator.
+TEST(AddressFormatterTest, GetFormattedNationalAddressLatinFormat) {
+  /* 大安區 */
+  const char kTaiwanCity[] = "\xE5\xA4\xA7\xE5\xAE\x89\xE5\x8D\x80";
+  /* 台北市 */
+  const char kTaiwanAdmin[] = "\xE5\x8F\xB0\xE5\x8C\x97\xE5\xB8\x82";
+  /* 台灣信義路三段33號 */
+  // The literal spans two source lines, so the only ';' belongs at the very
+  // end; ending the first line with one would cut the string off in the middle
+  // of a UTF-8 sequence.
+  const char kTaiwanStreetLine[] = "\xE5\x8F\xB0\xE7\x81\xA3\xE4\xBF\xA1\xE7"
+      "\xBE\xA9\xE8\xB7\xAF\xE4\xB8\x89\xE6\xAE\xB5" "33" "\xE8\x99\x9F";
+  const char kPostalCode[] = "106";
+
+  AddressData address;
+  address.region_code = "TW";
+  address.address_line.push_back(kTaiwanStreetLine);
+  address.postal_code = kPostalCode;
+  address.locality = kTaiwanCity;
+  address.administrative_area = kTaiwanAdmin;
+  address.language_code = "zh-Hant";
+
+  std::vector<std::string> expected;
+  expected.push_back(kPostalCode);
+  expected.push_back(std::string(kTaiwanAdmin).append(kTaiwanCity));
+  expected.push_back(kTaiwanStreetLine);
+
+  std::vector<std::string> lines;
+  GetFormattedNationalAddress(address, &lines);
+  EXPECT_EQ(expected, lines);
+
+  std::string one_line;
+  GetFormattedNationalAddressLine(address, &one_line);
+  // No separators expected for Chinese.
+  EXPECT_EQ(std::string(kPostalCode).append(kTaiwanAdmin).append(kTaiwanCity)
+                .append(kTaiwanStreetLine),
+            one_line);
+
+  // Changing to the latin variant will change the output.
+  AddressData latin_address;
+  latin_address.region_code = "TW";
+  latin_address.address_line.push_back("No. 33, Section 3 Xinyi Rd");
+  latin_address.postal_code = kPostalCode;
+  latin_address.locality = "Da-an District";
+  latin_address.administrative_area = "Taipei City";
+  latin_address.language_code = "zh-Latn";
+
+  std::vector<std::string> expected_latin;
+  expected_latin.push_back("No. 33, Section 3 Xinyi Rd");
+  expected_latin.push_back("Da-an District, Taipei City 106");
+
+  lines.clear();
+  GetFormattedNationalAddress(latin_address, &lines);
+  EXPECT_EQ(expected_latin, lines);
+
+  GetFormattedNationalAddressLine(latin_address, &one_line);
+  // We expect ", " as the new-line replacements for zh-Latn.
+  EXPECT_EQ("No. 33, Section 3 Xinyi Rd, Da-an District, Taipei City 106",
+            one_line);
+}
+
+// Formats a French-language Canadian address onto multiple lines.
+TEST(AddressFormatterTest, GetFormattedNationalAddressMultilingualCountry) {
+  AddressData address;
+  address.region_code = "CA";
+  address.language_code = "fr";
+  address.administrative_area = "QC";
+  address.locality = "Montmagny";
+  address.postal_code = "G1R 123";
+  address.address_line.push_back("5 Rue du Tresor");
+  address.address_line.push_back("Apt. 4");
+
+  const char* const kExpectedLines[] = {
+    "5 Rue du Tresor",
+    "Apt. 4",
+    "Montmagny QC G1R 123"
+  };
+  const std::vector<std::string> expected(
+      kExpectedLines,
+      kExpectedLines + sizeof(kExpectedLines) / sizeof(kExpectedLines[0]));
+
+  std::vector<std::string> lines;
+  GetFormattedNationalAddress(address, &lines);
+  EXPECT_EQ(expected, lines);
+}
+
+} // namespace