Adding address formatting functionality - on a single line, on multiple lines, and combining multiple street address lines into one, in a language-dependent fashion.

commit: 2dd9a2f787278f8a5c20c7d673327e2a61b04e96 [log] [tgz]
author: Lara Scheidegger <lararennie@google.com> Thu May 22 09:10:22 2014 +0000
committer: Fredrik Roubert <roubert@google.com> Mon Sep 01 19:20:44 2014 +0200
tree: 29a6c14a4bf4790cf8b2e50ae5ad9a339cbfc6e7
parent: 491754991ce2951f0e405455c661341fffab036f [diff]
diff --git a/cpp/include/libaddressinput/address_formatter.h b/cpp/include/libaddressinput/address_formatter.h
new file mode 100644
index 0000000..56d2c45
--- /dev/null
+++ b/cpp/include/libaddressinput/address_formatter.h

@@ -0,0 +1,49 @@
+// Copyright (C) 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Utility functions for formatting the addresses represented as AddressData.
+
+#ifndef I18N_ADDRESSINPUT_ADDRESS_FORMATTER_H
+#define I18N_ADDRESSINPUT_ADDRESS_FORMATTER_H
+
+#include <libaddressinput/address_formatter.h>
+
+#include <string>
+#include <vector>
+
+namespace i18n {
+namespace addressinput {
+
+struct AddressData;
+
+// Formats the address onto multiple lines. This formats the address in the
+// national format of the address's region, without the country. Any previous
+// content of |lines| is discarded; |lines| must not be NULL.
+void GetFormattedNationalAddress(
+    const AddressData& address_data, std::vector<std::string>* lines);
+
+// Formats the address as a single line. This formats the address in national
+// format, without the country: the lines that GetFormattedNationalAddress()
+// produces are joined with a separator chosen from the language code of
+// |address_data|.
+void GetFormattedNationalAddressLine(
+    const AddressData& address_data, std::string* line);
+
+// Formats the street-level part of an address as a single line. For example,
+// two lines of "Apt 1", "10 Red St." will be concatenated in a
+// language-appropriate way, to give something like "Apt 1, 10 Red St.". The
+// separator is chosen from the language code of |address_data|; ", " is used
+// when no language code is set.
+void GetStreetAddressLinesAsSingleLine(
+    const AddressData& address_data, std::string* line);
+
+}  // namespace addressinput
+}  // namespace i18n
+
+#endif  // I18N_ADDRESSINPUT_ADDRESS_FORMATTER_H

diff --git a/cpp/libaddressinput.gypi b/cpp/libaddressinput.gypi
index cfb63c4..1c2f2f2 100644
--- a/cpp/libaddressinput.gypi
+++ b/cpp/libaddressinput.gypi

@@ -17,6 +17,7 @@
       'src/address_data.cc',
       'src/address_field.cc',
       'src/address_field_util.cc',
+      'src/address_formatter.cc',
       'src/address_problem.cc',
       'src/address_ui.cc',
       'src/address_validator.cc',
@@ -46,6 +47,7 @@
       'test/address_data_test.cc',
       'test/address_field_test.cc',
       'test/address_field_util_test.cc',
+      'test/address_formatter_test.cc',
       'test/address_problem_test.cc',
       'test/address_ui_test.cc',
       'test/address_validator_test.cc',

diff --git a/cpp/src/address_formatter.cc b/cpp/src/address_formatter.cc
new file mode 100644
index 0000000..872d381
--- /dev/null
+++ b/cpp/src/address_formatter.cc

@@ -0,0 +1,191 @@
+// Copyright (C) 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <libaddressinput/address_formatter.h>
+
+#include <libaddressinput/address_data.h>
+#include <libaddressinput/address_field.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <string>
+#include <vector>
+
+#include "format_element.h"
+#include "language.h"
+#include "region_data_constants.h"
+#include "rule.h"
+
+namespace i18n {
+namespace addressinput {
+
+namespace {
+
+// Separators that can be placed between address lines when several lines are
+// joined into a single one.
+const char kCommaSeparator[] = ", ";
+const char kSpaceSeparator[] = " ";
+const char kArabicCommaSeparator[] = "\xD8\x8C" " ";  /* "، " */
+
+// Base languages whose lines are joined with a single space. The entries are
+// const pointers so that the table itself cannot be modified at run time.
+const char* const kLanguagesThatUseSpace[] = {
+  "th",
+  "ko"
+};
+
+// Base languages whose lines are joined without any separator.
+const char* const kLanguagesThatHaveNoSeparator[] = {
+  "ja",
+  "zh"  // All Chinese variants.
+};
+
+// This data is based on CLDR, for languages that are in official use in some
+// country, where Arabic is the most likely script tag.
+// TODO: Consider supporting variants such as tr-Arab by detecting the script
+// code.
+const char* const kLanguagesThatUseAnArabicComma[] = {
+  "ar",
+  "az",
+  "fa",
+  "kk",
+  "ku",
+  "ky",
+  "ps",
+  "tg",
+  "tk",
+  "ur",
+  "uz"
+};
+
+// Case insensitive matcher for language tags, usable as the predicate of
+// std::find_if(). The comparison is the one performed by strcasecmp().
+struct LanguageMatcher {
+  // Explicit, so that a string is never silently converted into a matcher.
+  explicit LanguageMatcher(const std::string& tag) : tag(tag) {}
+  std::string tag;
+  // Returns true if |s| equals the stored tag, ignoring case. Const because
+  // matching does not modify the matcher, so const instances can be called.
+  bool operator() (const std::string& s) const {
+    return strcasecmp(tag.c_str(), s.c_str()) == 0;
+  }
+};
+
+// Returns true if |language| matches, ignoring case, one of the |N| entries of
+// the |languages| table.
+template <typename T, size_t N>
+bool IsListedLanguage(T (&languages)[N], const std::string& language) {
+  T* const end = languages + N;
+  return std::find_if(languages, end, LanguageMatcher(language)) != end;
+}
+
+// Chooses the text to insert between two address lines when they are joined
+// into a single line for an address tagged with |language_tag|.
+std::string GetLineSeparatorForLanguage(const std::string& language_tag) {
+  Language parsed_language(language_tag);
+
+  // An explicit Latin script tag takes precedence over whatever the base
+  // language would otherwise suggest.
+  if (parsed_language.has_latin_script) {
+    return kCommaSeparator;
+  }
+
+  // No Latin script tag: guess from the base language alone.
+  const std::string& base = parsed_language.base;
+  if (IsListedLanguage(kLanguagesThatUseSpace, base)) {
+    return kSpaceSeparator;
+  }
+  if (IsListedLanguage(kLanguagesThatHaveNoSeparator, base)) {
+    return "";
+  }
+  if (IsListedLanguage(kLanguagesThatUseAnArabicComma, base)) {
+    return kArabicCommaSeparator;
+  }
+
+  // Either the language is a latin-script language, or no language was
+  // specified. In the latter case ", " is still returned, being the most
+  // common separator in use. In countries that don't use it, e.g. Thailand,
+  // addresses are often written in latin script where it would still be
+  // appropriate, so it is a reasonable default in the absence of information.
+  return kCommaSeparator;
+}
+
+// Joins |lines| into |line|, inserting between consecutive entries the
+// separator that GetLineSeparatorForLanguage() returns for |language_tag|.
+// Any previous content of |line| is replaced; an empty |lines| produces an
+// empty |line|. |line| must not be NULL.
+void CombineLinesForLanguage(
+    const std::vector<std::string>& lines, const std::string& language_tag,
+    std::string *line) {
+  assert(line != NULL);
+
+  // Build the result in a local string: the output is then always replaced as
+  // a whole, so a reused |line| never keeps stale text when |lines| is empty.
+  std::string combined;
+  if (!lines.empty()) {
+    const std::string separator = GetLineSeparatorForLanguage(language_tag);
+    // Iterate from begin() and skip the separator before the first entry.
+    // Starting at begin() + 1 would be undefined for an empty vector.
+    for (std::vector<std::string>::const_iterator it = lines.begin();
+         it != lines.end(); ++it) {
+      if (it != lines.begin()) {
+        combined.append(separator);
+      }
+      combined.append(*it);
+    }
+  }
+  line->swap(combined);
+}
+
+}  // namespace
+
+// Formats |address_data| onto multiple lines following the format of the rule
+// obtained for |address_data.region_code|.
+//
+// |lines| must not be NULL and its previous content is discarded. A line that
+// would be empty is not emitted, and each street address line becomes an
+// output line of its own.
+void GetFormattedNationalAddress(
+    const AddressData& address_data, std::vector<std::string>* lines) {
+  assert(lines != NULL);
+  lines->clear();
+
+  // Start from the default rule, then parse the serialized rule of the
+  // address's region into it.
+  Rule rule;
+  rule.CopyFrom(Rule::GetDefault());
+  // TODO: Eventually, we should get the best rule for this country and
+  // language, rather than just for the country.
+  rule.ParseSerializedRule(RegionDataConstants::GetRegionData(
+      address_data.region_code));
+
+  Language language(address_data.language_code);
+
+  // If latinized rules are available and the |language_code| of this address is
+  // explicitly tagged as being Latin, then use the latinized formatting rules.
+  const std::vector<FormatElement>& format =
+      language.has_latin_script && !rule.GetLatinFormat().empty()
+          ? rule.GetLatinFormat() : rule.GetFormat();
+
+  // |line| accumulates the output line currently being built.
+  std::string line;
+  for (std::vector<FormatElement>::const_iterator it = format.begin();
+       it != format.end(); ++it) {
+    // Bind by reference instead of copying the element on every iteration.
+    const FormatElement& element = *it;
+    if (element.IsNewline()) {
+      // End of a format line: emit what was accumulated, unless it is empty.
+      if (!line.empty()) {
+        lines->push_back(line);
+        line.clear();
+      }
+    } else if (element.IsField()) {
+      AddressField field = element.GetField();
+      if (field == STREET_ADDRESS) {
+        // The field "street address" represents the street address lines of an
+        // address, so there can be multiple values. Flush any pending text
+        // first so that the street lines start on a line of their own.
+        if (!line.empty()) {
+          lines->push_back(line);
+          line.clear();
+        }
+        lines->insert(lines->end(), address_data.address_line.begin(),
+                      address_data.address_line.end());
+      } else {
+        line.append(address_data.GetFieldValue(field));
+      }
+    } else {
+      // Neither a newline nor a field: literal text copied verbatim.
+      line.append(element.GetLiteral());
+    }
+  }
+  // Emit the trailing line, which has no newline element after it.
+  if (!line.empty()) {
+    lines->push_back(line);
+  }
+}
+
+// Single-line variant of GetFormattedNationalAddress(): formats the address
+// onto multiple lines, then joins them into |line| using the separator that
+// fits the address's language code.
+void GetFormattedNationalAddressLine(
+    const AddressData& address_data, std::string* line) {
+  // Step 1: the national format, one entry per line.
+  std::vector<std::string> national_lines;
+  GetFormattedNationalAddress(address_data, &national_lines);
+
+  // Step 2: collapse those lines into one.
+  const std::string& language_tag = address_data.language_code;
+  CombineLinesForLanguage(national_lines, language_tag, line);
+}
+
+// Joins the street address lines of |address_data| into |line|, using the
+// separator that fits the address's language code.
+void GetStreetAddressLinesAsSingleLine(
+    const AddressData& address_data, std::string* line) {
+  const std::vector<std::string>& street_lines = address_data.address_line;
+  const std::string& language_tag = address_data.language_code;
+  CombineLinesForLanguage(street_lines, language_tag, line);
+}
+
+}  // namespace addressinput
+}  // namespace i18n

diff --git a/cpp/src/format_element.h b/cpp/src/format_element.h
index 0fc3d56..e37a188 100644
--- a/cpp/src/format_element.h
+++ b/cpp/src/format_element.h

@@ -44,7 +44,7 @@
   // Returns true if this element represents a new line.
   bool IsNewline() const { return literal_ == "\n"; }
 
-  const AddressField& GetField() const { return field_; }
+  AddressField GetField() const { return field_; }
   const std::string& GetLiteral() const { return literal_; }
 
   bool operator==(const FormatElement& other) const;

diff --git a/cpp/test/address_formatter_test.cc b/cpp/test/address_formatter_test.cc
new file mode 100644
index 0000000..ae5b847
--- /dev/null
+++ b/cpp/test/address_formatter_test.cc

@@ -0,0 +1,201 @@
+// Copyright (C) 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <libaddressinput/address_formatter.h>
+
+#include <libaddressinput/address_data.h>
+
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+namespace {
+
+using i18n::addressinput::AddressData;
+using i18n::addressinput::GetStreetAddressLinesAsSingleLine;
+using i18n::addressinput::GetFormattedNationalAddress;
+using i18n::addressinput::GetFormattedNationalAddressLine;
+
+// With a single street line there is nothing to join, so the output is that
+// line whatever the language code is.
+TEST(AddressFormatterTest, GetStreetAddressLinesAsSingleLine_1Line) {
+  AddressData address;
+  address.region_code = "US";  // Not used.
+  address.address_line.push_back("Line 1");
+
+  // First without any language code.
+  std::string result;
+  GetStreetAddressLinesAsSingleLine(address, &result);
+  EXPECT_EQ("Line 1", result);
+
+  // Setting the language code, with one line, shouldn't affect anything.
+  const char* const kLanguageCodes[] = { "en", "zh-Hans" };
+  const char* const* const kLanguageCodesEnd =
+      kLanguageCodes + sizeof kLanguageCodes / sizeof kLanguageCodes[0];
+  for (const char* const* code = kLanguageCodes; code != kLanguageCodesEnd;
+       ++code) {
+    address.language_code = *code;
+    GetStreetAddressLinesAsSingleLine(address, &result);
+    EXPECT_EQ("Line 1", result);
+  }
+}
+
+// With two street lines the separator between them depends on the language
+// code of the address.
+TEST(AddressFormatterTest, GetStreetAddressLinesAsSingleLine_2Lines) {
+  AddressData address;
+  address.region_code = "US";  // Not used.
+  address.address_line.push_back("Line 1");
+  address.address_line.push_back("Line 2");
+
+  // Default separator if no language code is specified: ", "
+  std::string result;
+  GetStreetAddressLinesAsSingleLine(address, &result);
+  EXPECT_EQ("Line 1, Line 2", result);
+
+  // Each case is checked in order, reusing the same address and output string.
+  struct Case {
+    const char* language_code;
+    const char* expected;
+  };
+  const Case kCases[] = {
+    { "en", "Line 1, Line 2" },
+    { "zh-Hans", "Line 1Line 2" },  // Chinese has no separator.
+    { "ko", "Line 1 Line 2" },
+    { "ar", "Line 1\xD8\x8C" " Line 2" },  // Arabic comma.
+  };
+  const Case* const kCasesEnd = kCases + sizeof kCases / sizeof kCases[0];
+  for (const Case* current = kCases; current != kCasesEnd; ++current) {
+    address.language_code = current->language_code;
+    GetStreetAddressLinesAsSingleLine(address, &result);
+    EXPECT_EQ(std::string(current->expected), result);
+  }
+}
+
+// The separator is inserted between every pair of consecutive lines, not only
+// between the first two.
+TEST(AddressFormatterTest, GetStreetAddressLinesAsSingleLine_5Lines) {
+  AddressData address;
+  address.region_code = "US";  // Not used.
+  address.address_line.push_back("Line 1");
+  address.address_line.push_back("Line 2");
+  address.address_line.push_back("Line 3");
+  address.address_line.push_back("Line 4");
+  address.address_line.push_back("Line 5");
+  address.language_code = "fr";
+
+  std::string result;
+  GetStreetAddressLinesAsSingleLine(address, &result);
+  // Expected value first, like every other assertion in this file, so that a
+  // failure message labels the two values correctly.
+  EXPECT_EQ("Line 1, Line 2, Line 3, Line 4, Line 5", result);
+}
+
+// For a region with a single format, the language code of the address has no
+// influence on the formatted lines.
+TEST(AddressFormatterTest, GetFormattedNationalAddressLocalLanguage) {
+  AddressData address;
+  address.region_code = "NZ";
+  address.locality = "Leeston";
+  address.postal_code = "8704";
+  address.address_line.push_back("Rotopapa");
+  address.address_line.push_back("Irwell 3RD");
+
+  const char* const kExpectedLines[] = {
+    "Rotopapa",
+    "Irwell 3RD",
+    "Leeston 8704",
+  };
+  const std::vector<std::string> expected(
+      kExpectedLines,
+      kExpectedLines + sizeof kExpectedLines / sizeof kExpectedLines[0]);
+
+  std::vector<std::string> lines;
+  GetFormattedNationalAddress(address, &lines);
+  EXPECT_EQ(expected, lines);
+
+  // Should be the same result no matter what the language code is. We choose
+  // an unlikely language code to illustrate this.
+  address.language_code = "en-Latn-CN";
+
+  lines.clear();
+  GetFormattedNationalAddress(address, &lines);
+  EXPECT_EQ(expected, lines);
+
+  // The single-line form joins the same lines with ", ".
+  std::string one_line;
+  GetFormattedNationalAddressLine(address, &one_line);
+  EXPECT_EQ("Rotopapa, Irwell 3RD, Leeston 8704", one_line);
+}
+
+// For a region that also has a latinized format, an address tagged with a
+// Latin script is formatted differently from one in the local script.
+TEST(AddressFormatterTest, GetFormattedNationalAddressLatinFormat) {
+  /* 大安區 */
+  const char kTaiwanCity[] = "\xE5\xA4\xA7\xE5\xAE\x89\xE5\x8D\x80";
+  /* 台北市 */
+  const char kTaiwanAdmin[] = "\xE5\x8F\xB0\xE5\x8C\x97\xE5\xB8\x82";
+  /* 台灣信義路三段33號 */
+  // The literal continues on the next line, so there must be no ';' here:
+  // otherwise the string is cut in the middle of a UTF-8 sequence.
+  const char kTaiwanStreetLine[]= "\xE5\x8F\xB0\xE7\x81\xA3\xE4\xBF\xA1\xE7"
+    "\xBE\xA9\xE8\xB7\xAF\xE4\xB8\x89\xE6\xAE\xB5" "33" "\xE8\x99\x9F";
+  const char kPostalCode[] = "106";
+
+  AddressData address;
+  address.region_code = "TW";
+  address.address_line.push_back(kTaiwanStreetLine);
+  address.postal_code = kPostalCode;
+  address.locality = kTaiwanCity;
+  address.administrative_area = kTaiwanAdmin;
+  address.language_code = "zh-Hant";
+
+  std::vector<std::string> expected;
+  expected.push_back(kPostalCode);
+  expected.push_back(std::string(kTaiwanAdmin).append(kTaiwanCity));
+  expected.push_back(kTaiwanStreetLine);
+
+  std::vector<std::string> lines;
+  GetFormattedNationalAddress(address, &lines);
+  EXPECT_EQ(expected, lines);
+
+  std::string one_line;
+  GetFormattedNationalAddressLine(address, &one_line);
+  // No separators expected for Chinese.
+  EXPECT_EQ(std::string(kPostalCode).append(kTaiwanAdmin).append(kTaiwanCity)
+            .append(kTaiwanStreetLine),
+            one_line);
+
+  // Changing to the latin variant will change the output.
+  AddressData latin_address;
+  latin_address.region_code = "TW";
+  latin_address.address_line.push_back("No. 33, Section 3 Xinyi Rd");
+  latin_address.postal_code = kPostalCode;
+  latin_address.locality = "Da-an District";
+  latin_address.administrative_area = "Taipei City";
+  latin_address.language_code = "zh-Latn";
+
+  std::vector<std::string> expected_latin;
+  expected_latin.push_back("No. 33, Section 3 Xinyi Rd");
+  expected_latin.push_back("Da-an District, Taipei City 106");
+
+  lines.clear();
+  GetFormattedNationalAddress(latin_address, &lines);
+  EXPECT_EQ(expected_latin, lines);
+
+  GetFormattedNationalAddressLine(latin_address, &one_line);
+  // We expect ", " as the new-line replacements for zh-Latn.
+  EXPECT_EQ("No. 33, Section 3 Xinyi Rd, Da-an District, Taipei City 106",
+            one_line);
+}
+
+// Formats a French-language address in Canada, a region with more than one
+// language.
+TEST(AddressFormatterTest, GetFormattedNationalAddressMultilingualCountry) {
+  AddressData address;
+  address.region_code = "CA";
+  address.language_code = "fr";
+  address.administrative_area = "QC";
+  address.locality = "Montmagny";
+  address.postal_code = "G1R 123";
+  address.address_line.push_back("5 Rue du Tresor");
+  address.address_line.push_back("Apt. 4");
+
+  // Both street lines first, then locality, province and postal code together.
+  const char* const kExpectedLines[] = {
+    "5 Rue du Tresor",
+    "Apt. 4",
+    "Montmagny QC G1R 123",
+  };
+  const std::vector<std::string> expected(
+      kExpectedLines,
+      kExpectedLines + sizeof kExpectedLines / sizeof kExpectedLines[0]);
+
+  std::vector<std::string> lines;
+  GetFormattedNationalAddress(address, &lines);
+  EXPECT_EQ(expected, lines);
+}
+
+}  // namespace
commit	2dd9a2f787278f8a5c20c7d673327e2a61b04e96	[log] [tgz]
author	Lara Scheidegger <lararennie@google.com>	Thu May 22 09:10:22 2014 +0000
committer	Fredrik Roubert <roubert@google.com>	Mon Sep 01 19:20:44 2014 +0200
tree	29a6c14a4bf4790cf8b2e50ae5ad9a339cbfc6e7
parent	491754991ce2951f0e405455c661341fffab036f [diff]