Enhance URL regular expression to match more Unicode chars.

Enhance URL regular expression to match legal one byte Unicode characters in
Internationalized Resource Identifiers as detailed in RFC 3987.  Specifically
two byte Unicode characters are not included.  Not all things in RFC 3987 is
implemented, this is just an enhancement for recognizing more common used one
byte Unicode characters.

This change helps Browser address bar identify more valid URL without scheme
typed in, such as 현금영수증.kr

make-iana-tld-pattern.py is modified to contain only Top Level Domain
regular expression generation.  Other parts of WEB_URL pattern are in
solely in Patters.java for better consistency and maintenance.
diff --git a/common/java/com/android/common/Patterns.java b/common/java/com/android/common/Patterns.java
index 71c3a5e..3b3b038 100644
--- a/common/java/com/android/common/Patterns.java
+++ b/common/java/com/android/common/Patterns.java
@@ -24,12 +24,12 @@
  */
 public class Patterns {
     /**
-     *  Regular expression pattern to match all IANA top-level domains.
+     *  Regular expression to match all IANA top-level domains.
      *  List accurate as of 2010/02/05.  List taken from:
      *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
-     *  This pattern is auto-generated by development/tools/make-iana-tld-pattern.py
+     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
      */
-    public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
+    public static final String TOP_LEVEL_DOMAIN_STR =
         "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
         + "|(biz|b[abdefghijmnorstvwyz])"
         + "|(cat|com|coop|c[acdfghiklmnoruvxyz])"
@@ -55,20 +55,22 @@
         + "|w[fs]"
         + "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)"
         + "|y[etu]"
-        + "|z[amw])");
+        + "|z[amw])";
 
     /**
-     *  Regular expression pattern to match RFC 1738 URLs
+     *  Regular expression pattern to match all IANA top-level domains.
+     */
+    public static final Pattern TOP_LEVEL_DOMAIN =
+        Pattern.compile(TOP_LEVEL_DOMAIN_STR);
+
+    /**
+     *  Regular expression to match all IANA top-level domains for WEB_URL.
      *  List accurate as of 2010/02/05.  List taken from:
      *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
-     *  This pattern is auto-generated by development/tools/make-iana-tld-pattern.py
+     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
      */
-    public static final Pattern WEB_URL = Pattern.compile(
-        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
-        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
-        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
-        + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+"   // named host
-        + "(?:"   // plus top level domain
+    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
+        "(?:"
         + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
         + "|(?:biz|b[abdefghijmnorstvwyz])"
         + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
@@ -94,7 +96,28 @@
         + "|w[fs]"
         + "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)"
         + "|y[etu]"
-        + "|z[amw]))"
+        + "|z[amw]))";
+
+    /**
+     * Good characters for Internationalized Resource Identifiers (IRI).
+     * This comprises most common used Unicode characters allowed in IRI
+     * as detailed in RFC 3987.
+     * Specifically, those two byte Unicode characters are not included.
+     */
+    public static final String GOOD_IRI_CHAR =
+        "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";
+
+    /**
+     *  Regular expression pattern to match most part of RFC 3987
+     *  Internationalized URLs, aka IRIs.  Commonly used Unicode characters are
+     *  added.
+     */
+    public static final Pattern WEB_URL = Pattern.compile(
+        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
+        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
+        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
+        + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+"   // named host
+        + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL
         + "|(?:(?:25[0-5]|2[0-4]" // or ip address
         + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
         + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
@@ -116,7 +139,7 @@
 
     public static final Pattern DOMAIN_NAME
         = Pattern.compile(
-            "(((([a-zA-Z0-9][a-zA-Z0-9\\-]*)*[a-zA-Z0-9]\\.)+"
+            "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+"
             + TOP_LEVEL_DOMAIN + ")|"
             + IP_ADDRESS + ")");
 
diff --git a/common/tests/src/com/android/common/PatternsTest.java b/common/tests/src/com/android/common/PatternsTest.java
index 635601e..9e2ad58 100644
--- a/common/tests/src/com/android/common/PatternsTest.java
+++ b/common/tests/src/com/android/common/PatternsTest.java
@@ -68,6 +68,12 @@
         t = Patterns.WEB_URL.matcher("xn--fsqu00a.xn--0zwm56d").matches();
         assertTrue("Valid URL", t);
 
+        // Internationalized URL.
+        t = Patterns.WEB_URL.matcher("http://\uD604\uAE08\uC601\uC218\uC99D.kr").matches();
+        assertTrue("Valid URL", t);
+        t = Patterns.WEB_URL.matcher("\uD604\uAE08\uC601\uC218\uC99D.kr").matches();
+        assertTrue("Valid URL", t);
+
         t = Patterns.WEB_URL.matcher("ftp://www.example.com").matches();
         assertFalse("Matched invalid protocol", t);
 
@@ -99,6 +105,13 @@
         t = Patterns.DOMAIN_NAME.matcher("mail.example.com").matches();
         assertTrue("Valid domain", t);
 
+        t = Patterns.WEB_URL.matcher("google.me").matches();
+        assertTrue("Valid domain", t);
+
+        // Internationalized domains.
+        t = Patterns.DOMAIN_NAME.matcher("\uD604\uAE08\uC601\uC218\uC99D.kr").matches();
+        assertTrue("Valid domain", t);
+
         t = Patterns.DOMAIN_NAME.matcher("__+&42.xer").matches();
         assertFalse("Invalid domain", t);
     }
diff --git a/common/tools/make-iana-tld-pattern.py b/common/tools/make-iana-tld-pattern.py
index ece4dcf..de81c58 100755
--- a/common/tools/make-iana-tld-pattern.py
+++ b/common/tools/make-iana-tld-pattern.py
@@ -4,43 +4,27 @@
 
 TLD_PREFIX = r"""
     /**
-     *  Regular expression pattern to match all IANA top-level domains.
+     *  Regular expression to match all IANA top-level domains.
      *  List accurate as of 2010/02/05.  List taken from:
      *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
      *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
      */
-    public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
+    public static final String TOP_LEVEL_DOMAIN_STR =
 """
-TLD_SUFFIX = '");'
+TLD_SUFFIX = '";'
 
 URL_PREFIX = r"""
     /**
-     *  Regular expression pattern to match RFC 1738 URLs
+     *  Regular expression to match all IANA top-level domains for WEB_URL.
      *  List accurate as of 2010/02/05.  List taken from:
      *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
-     *  This pattern is auto-generated by frameworkds/base/common/tools/make-iana-tld-pattern.py
+     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
      */
-    public static final Pattern WEB_URL = Pattern.compile(
-        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
-        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
-        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
-        + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+"   // named host
-        + "(?:"   // plus top level domain
+    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
+        "(?:"
 """
 
-URL_SUFFIX = r"""
-        + "|(?:(?:25[0-5]|2[0-4]" // or ip address
-        + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
-        + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
-        + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
-        + "|[1-9][0-9]|[0-9])))"
-        + "(?:\\:\\d{1,5})?)" // plus option port number
-        + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus option query params
-        + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
-        + "(?:\\b|$)"); // and finally, a word boundary or end of
-                        // input.  This is to stop foo.sure from
-                        // matching as foo.su
-"""
+URL_SUFFIX = ';'
 
 class Bucket:
     def __init__(self, baseLetter):
diff --git a/core/java/android/net/WebAddress.java b/core/java/android/net/WebAddress.java
index f4ae66a..2d078c1 100644
--- a/core/java/android/net/WebAddress.java
+++ b/core/java/android/net/WebAddress.java
@@ -16,6 +16,8 @@
 
 package android.net;
 
+import static com.android.common.Patterns.GOOD_IRI_CHAR;
+
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -54,7 +56,7 @@
     static Pattern sAddressPattern = Pattern.compile(
             /* scheme    */ "(?:(http|HTTP|https|HTTPS|file|FILE)\\:\\/\\/)?" +
             /* authority */ "(?:([-A-Za-z0-9$_.+!*'(),;?&=]+(?:\\:[-A-Za-z0-9$_.+!*'(),;?&=]+)?)@)?" +
-            /* host      */ "([-A-Za-z0-9%_]+(?:\\.[-A-Za-z0-9%_]+)*|\\[[0-9a-fA-F:\\.]+\\])?" +
+            /* host      */ "([-" + GOOD_IRI_CHAR + "%_]+(?:\\.[-" + GOOD_IRI_CHAR + "%_]+)*|\\[[0-9a-fA-F:\\.]+\\])?" +
             /* port      */ "(?:\\:([0-9]+))?" +
             /* path      */ "(\\/?.*)?");