Reapply an upstream OkHttp change to Android for HttpUrl

Manually reapplied change from upstream to address HttpUrl encoding issues.
This change did not apply cleanly. Upstream FormEncodingBuilder has
been refactored. Equivalent changes have been made.

Upstream details:
Comment:
Never throw converting an HttpUrl to a java.net.URI.

Just do arbitrary amounts of transformation. Sigh.

Closes https://github.com/square/okhttp/issues/2116
SHA: d77edcc8905148f18a691be180c4f8f366a5ee1b

Bug: 27590872
Change-Id: Ia12df5dc95c1681c67552b2c7725393508f52c0e
diff --git a/okhttp-tests/src/test/java/com/squareup/okhttp/HttpUrlTest.java b/okhttp-tests/src/test/java/com/squareup/okhttp/HttpUrlTest.java
index d2fc31d..71deb6c 100644
--- a/okhttp-tests/src/test/java/com/squareup/okhttp/HttpUrlTest.java
+++ b/okhttp-tests/src/test/java/com/squareup/okhttp/HttpUrlTest.java
@@ -483,12 +483,7 @@
     assertEquals("http://host/#\u0080", url.toString());
     assertEquals("\u0080", url.fragment());
     assertEquals("\u0080", url.encodedFragment());
-    try {
-      url.uri();
-      fail();
-    } catch (IllegalStateException expected) {
-      // Possibly a bug in java.net.URI. Many non-ASCII code points work, this one doesn't!
-    }
+    assertEquals(new URI("http://host/#"), url.uri()); // Control characters may be stripped!
   }
 
   @Test public void fragmentPercentEncodedNonAscii() throws Exception {
@@ -1056,19 +1051,46 @@
     assertEquals("http://host/#=[]:;%22~%7C?%23@%5E/$%25*", url.uri().toString());
   }
 
-  @Test public void toUriWithMalformedPercentEscape() throws Exception {
-    HttpUrl url = new HttpUrl.Builder()
-        .scheme("http")
-        .host("host")
-        .encodedPath("/%xx")
-        .build();
-    assertEquals("http://host/%xx", url.toString());
-    try {
-      url.uri();
-      fail();
-    } catch (IllegalStateException expected) {
-      assertEquals("not valid as a java.net.URI: http://host/%xx", expected.getMessage());
-    }
+  @Test public void toUriWithControlCharacters() throws Exception {
+    // Percent-encoded in the path.
+    assertEquals(new URI("http://host/a%00b"), HttpUrl.parse("http://host/a\u0000b").uri());
+    assertEquals(new URI("http://host/a%C2%80b"), HttpUrl.parse("http://host/a\u0080b").uri());
+    assertEquals(new URI("http://host/a%C2%9Fb"), HttpUrl.parse("http://host/a\u009fb").uri());
+    // Percent-encoded in the query.
+    assertEquals(new URI("http://host/?a%00b"), HttpUrl.parse("http://host/?a\u0000b").uri());
+    assertEquals(new URI("http://host/?a%C2%80b"), HttpUrl.parse("http://host/?a\u0080b").uri());
+    assertEquals(new URI("http://host/?a%C2%9Fb"), HttpUrl.parse("http://host/?a\u009fb").uri());
+    // Stripped from the fragment.
+    assertEquals(new URI("http://host/#a%00b"), HttpUrl.parse("http://host/#a\u0000b").uri());
+    assertEquals(new URI("http://host/#ab"), HttpUrl.parse("http://host/#a\u0080b").uri());
+    assertEquals(new URI("http://host/#ab"), HttpUrl.parse("http://host/#a\u009fb").uri());
+  }
+
+  @Test public void toUriWithSpaceCharacters() throws Exception {
+    // Percent-encoded in the path.
+    assertEquals(new URI("http://host/a%0Bb"), HttpUrl.parse("http://host/a\u000bb").uri());
+    assertEquals(new URI("http://host/a%20b"), HttpUrl.parse("http://host/a b").uri());
+    assertEquals(new URI("http://host/a%E2%80%89b"), HttpUrl.parse("http://host/a\u2009b").uri());
+    assertEquals(new URI("http://host/a%E3%80%80b"), HttpUrl.parse("http://host/a\u3000b").uri());
+    // Percent-encoded in the query.
+    assertEquals(new URI("http://host/?a%0Bb"), HttpUrl.parse("http://host/?a\u000bb").uri());
+    assertEquals(new URI("http://host/?a%20b"), HttpUrl.parse("http://host/?a b").uri());
+    assertEquals(new URI("http://host/?a%E2%80%89b"), HttpUrl.parse("http://host/?a\u2009b").uri());
+    assertEquals(new URI("http://host/?a%E3%80%80b"), HttpUrl.parse("http://host/?a\u3000b").uri());
+    // Stripped from the fragment.
+    assertEquals(new URI("http://host/#a%0Bb"), HttpUrl.parse("http://host/#a\u000bb").uri());
+    assertEquals(new URI("http://host/#a%20b"), HttpUrl.parse("http://host/#a b").uri());
+    assertEquals(new URI("http://host/#ab"), HttpUrl.parse("http://host/#a\u2009b").uri());
+    assertEquals(new URI("http://host/#ab"), HttpUrl.parse("http://host/#a\u3000b").uri());
+  }
+
+  @Test public void toUriWithNonHexPercentEscape() throws Exception {
+    assertEquals(new URI("http://host/%25xx"), HttpUrl.parse("http://host/%xx").uri());
+  }
+
+  @Test public void toUriWithTruncatedPercentEscape() throws Exception {
+    assertEquals(new URI("http://host/%25a"), HttpUrl.parse("http://host/%a").uri());
+    assertEquals(new URI("http://host/%25"), HttpUrl.parse("http://host/%").uri());
   }
 
   @Test public void fromJavaNetUrl() throws Exception {
diff --git a/okhttp/src/main/java/com/squareup/okhttp/FormEncodingBuilder.java b/okhttp/src/main/java/com/squareup/okhttp/FormEncodingBuilder.java
index f5134d9..96f6917 100644
--- a/okhttp/src/main/java/com/squareup/okhttp/FormEncodingBuilder.java
+++ b/okhttp/src/main/java/com/squareup/okhttp/FormEncodingBuilder.java
@@ -33,10 +33,10 @@
       content.writeByte('&');
     }
     HttpUrl.canonicalize(content, name, 0, name.length(),
-        HttpUrl.FORM_ENCODE_SET, false, true, true);
+        HttpUrl.FORM_ENCODE_SET, false, false, true, true);
     content.writeByte('=');
     HttpUrl.canonicalize(content, value, 0, value.length(),
-        HttpUrl.FORM_ENCODE_SET, false, true, true);
+        HttpUrl.FORM_ENCODE_SET, false, false, true, true);
     return this;
   }
 
@@ -46,10 +46,10 @@
       content.writeByte('&');
     }
     HttpUrl.canonicalize(content, name, 0, name.length(),
-        HttpUrl.FORM_ENCODE_SET, true, true, true);
+        HttpUrl.FORM_ENCODE_SET, true, false, true, true);
     content.writeByte('=');
     HttpUrl.canonicalize(content, value, 0, value.length(),
-        HttpUrl.FORM_ENCODE_SET, true, true, true);
+        HttpUrl.FORM_ENCODE_SET, true, false, true, true);
     return this;
   }
 
diff --git a/okhttp/src/main/java/com/squareup/okhttp/HttpUrl.java b/okhttp/src/main/java/com/squareup/okhttp/HttpUrl.java
index 68b09ef..beabeca 100644
--- a/okhttp/src/main/java/com/squareup/okhttp/HttpUrl.java
+++ b/okhttp/src/main/java/com/squareup/okhttp/HttpUrl.java
@@ -327,19 +327,29 @@
   }
 
   /**
-   * Returns this URL as a {@link URI java.net.URI}. Because {@code URI} forbids certain characters
-   * like {@code [} and {@code |}, the returned URI may escape more characters than this URL.
+   * Returns this URL as a {@link URI java.net.URI}. Because {@code URI} is more strict than this
+   * class, the returned URI may be semantically different from this URL:
+   * <ul>
+   *   <li>Characters forbidden by URI like {@code [} and {@code |} will be escaped.
+   *   <li>Invalid percent-encoded sequences like {@code %xx} will be encoded like {@code %25xx}.
+   *   <li>Whitespace and control characters in the fragment will be stripped.
+   * </ul>
    *
-   * <p>This method throws an unchecked {@link IllegalStateException} if it cannot be converted to a
-   * URI even after escaping forbidden characters. In particular, URLs that contain malformed
-   * percent escapes like {@code http://host/%xx} will trigger this exception.
+   * <p>These differences may have a significant consequence when the URI is interpretted by a
+   * webserver. For this reason the {@linkplain URI URI class} and this method should be avoided.
    */
   public URI uri() {
+    String uri = newBuilder().reencodeForUri().toString();
     try {
-      String uri = newBuilder().reencodeForUri().toString();
       return new URI(uri);
     } catch (URISyntaxException e) {
-      throw new IllegalStateException("not valid as a java.net.URI: " + url);
+      // Unlikely edge case: the URI has a forbidden character in the fragment. Strip it & retry.
+      try {
+        String stripped = uri.replaceAll("[\\u0000-\\u001F\\u007F-\\u009F\\p{javaWhitespace}]", "");
+        return URI.create(stripped);
+      } catch (Exception e1) {
+        throw new RuntimeException(e); // Unexpected!
+      }
     }
   }
 
@@ -686,25 +696,27 @@
 
     public Builder username(String username) {
       if (username == null) throw new IllegalArgumentException("username == null");
-      this.encodedUsername = canonicalize(username, USERNAME_ENCODE_SET, false, false, true);
+      this.encodedUsername = canonicalize(username, USERNAME_ENCODE_SET, false, false, false, true);
       return this;
     }
 
     public Builder encodedUsername(String encodedUsername) {
       if (encodedUsername == null) throw new IllegalArgumentException("encodedUsername == null");
-      this.encodedUsername = canonicalize(encodedUsername, USERNAME_ENCODE_SET, true, false, true);
+      this.encodedUsername = canonicalize(
+          encodedUsername, USERNAME_ENCODE_SET, true, false, false, true);
       return this;
     }
 
     public Builder password(String password) {
       if (password == null) throw new IllegalArgumentException("password == null");
-      this.encodedPassword = canonicalize(password, PASSWORD_ENCODE_SET, false, false, true);
+      this.encodedPassword = canonicalize(password, PASSWORD_ENCODE_SET, false, false, false, true);
       return this;
     }
 
     public Builder encodedPassword(String encodedPassword) {
       if (encodedPassword == null) throw new IllegalArgumentException("encodedPassword == null");
-      this.encodedPassword = canonicalize(encodedPassword, PASSWORD_ENCODE_SET, true, false, true);
+      this.encodedPassword = canonicalize(
+          encodedPassword, PASSWORD_ENCODE_SET, true, false, false, true);
       return this;
     }
 
@@ -747,7 +759,7 @@
     public Builder setPathSegment(int index, String pathSegment) {
       if (pathSegment == null) throw new IllegalArgumentException("pathSegment == null");
       String canonicalPathSegment = canonicalize(
-          pathSegment, 0, pathSegment.length(), PATH_SEGMENT_ENCODE_SET, false, false, true);
+          pathSegment, 0, pathSegment.length(), PATH_SEGMENT_ENCODE_SET, false, false, false, true);
       if (isDot(canonicalPathSegment) || isDotDot(canonicalPathSegment)) {
         throw new IllegalArgumentException("unexpected path segment: " + pathSegment);
       }
@@ -760,7 +772,7 @@
         throw new IllegalArgumentException("encodedPathSegment == null");
       }
       String canonicalPathSegment = canonicalize(encodedPathSegment,
-          0, encodedPathSegment.length(), PATH_SEGMENT_ENCODE_SET, true, false, true);
+          0, encodedPathSegment.length(), PATH_SEGMENT_ENCODE_SET, true, false, false, true);
       encodedPathSegments.set(index, canonicalPathSegment);
       if (isDot(canonicalPathSegment) || isDotDot(canonicalPathSegment)) {
         throw new IllegalArgumentException("unexpected path segment: " + encodedPathSegment);
@@ -787,7 +799,8 @@
 
     public Builder query(String query) {
       this.encodedQueryNamesAndValues = query != null
-          ? queryStringToNamesAndValues(canonicalize(query, QUERY_ENCODE_SET, false, true, true))
+          ? queryStringToNamesAndValues(canonicalize(
+              query, QUERY_ENCODE_SET, false, false, true, true))
           : null;
       return this;
     }
@@ -795,7 +808,7 @@
     public Builder encodedQuery(String encodedQuery) {
       this.encodedQueryNamesAndValues = encodedQuery != null
           ? queryStringToNamesAndValues(
-              canonicalize(encodedQuery, QUERY_ENCODE_SET, true, true, true))
+              canonicalize(encodedQuery, QUERY_ENCODE_SET, true, false, true, true))
           : null;
       return this;
     }
@@ -805,9 +818,9 @@
       if (name == null) throw new IllegalArgumentException("name == null");
       if (encodedQueryNamesAndValues == null) encodedQueryNamesAndValues = new ArrayList<>();
       encodedQueryNamesAndValues.add(
-          canonicalize(name, QUERY_COMPONENT_ENCODE_SET, false, true, true));
+          canonicalize(name, QUERY_COMPONENT_ENCODE_SET, false, false, true, true));
       encodedQueryNamesAndValues.add(value != null
-          ? canonicalize(value, QUERY_COMPONENT_ENCODE_SET, false, true, true)
+          ? canonicalize(value, QUERY_COMPONENT_ENCODE_SET, false, false, true, true)
           : null);
       return this;
     }
@@ -817,9 +830,9 @@
       if (encodedName == null) throw new IllegalArgumentException("encodedName == null");
       if (encodedQueryNamesAndValues == null) encodedQueryNamesAndValues = new ArrayList<>();
       encodedQueryNamesAndValues.add(
-          canonicalize(encodedName, QUERY_COMPONENT_ENCODE_SET, true, true, true));
+          canonicalize(encodedName, QUERY_COMPONENT_ENCODE_SET, true, false, true, true));
       encodedQueryNamesAndValues.add(encodedValue != null
-          ? canonicalize(encodedValue, QUERY_COMPONENT_ENCODE_SET, true, true, true)
+          ? canonicalize(encodedValue, QUERY_COMPONENT_ENCODE_SET, true, false, true, true)
           : null);
       return this;
     }
@@ -839,7 +852,8 @@
     public Builder removeAllQueryParameters(String name) {
       if (name == null) throw new IllegalArgumentException("name == null");
       if (encodedQueryNamesAndValues == null) return this;
-      String nameToRemove = canonicalize(name, QUERY_COMPONENT_ENCODE_SET, false, true, true);
+      String nameToRemove = canonicalize(
+          name, QUERY_COMPONENT_ENCODE_SET, false, false, true, true);
       removeAllCanonicalQueryParameters(nameToRemove);
       return this;
     }
@@ -848,7 +862,7 @@
       if (encodedName == null) throw new IllegalArgumentException("encodedName == null");
       if (encodedQueryNamesAndValues == null) return this;
       removeAllCanonicalQueryParameters(
-          canonicalize(encodedName, QUERY_COMPONENT_ENCODE_SET, true, true, true));
+          canonicalize(encodedName, QUERY_COMPONENT_ENCODE_SET, true, false, true, true));
       return this;
     }
 
@@ -867,14 +881,14 @@
 
     public Builder fragment(String fragment) {
       this.encodedFragment = fragment != null
-          ? canonicalize(fragment, FRAGMENT_ENCODE_SET, false, false, false)
+          ? canonicalize(fragment, FRAGMENT_ENCODE_SET, false, false, false, false)
           : null;
       return this;
     }
 
     public Builder encodedFragment(String encodedFragment) {
       this.encodedFragment = encodedFragment != null
-          ? canonicalize(encodedFragment, FRAGMENT_ENCODE_SET, true, false, false)
+          ? canonicalize(encodedFragment, FRAGMENT_ENCODE_SET, true, false, false, false)
           : null;
       return this;
     }
@@ -887,20 +901,20 @@
       for (int i = 0, size = encodedPathSegments.size(); i < size; i++) {
         String pathSegment = encodedPathSegments.get(i);
         encodedPathSegments.set(i,
-            canonicalize(pathSegment, PATH_SEGMENT_ENCODE_SET_URI, true, false, true));
+            canonicalize(pathSegment, PATH_SEGMENT_ENCODE_SET_URI, true, true, false, true));
       }
       if (encodedQueryNamesAndValues != null) {
         for (int i = 0, size = encodedQueryNamesAndValues.size(); i < size; i++) {
           String component = encodedQueryNamesAndValues.get(i);
           if (component != null) {
             encodedQueryNamesAndValues.set(i,
-                canonicalize(component, QUERY_COMPONENT_ENCODE_SET_URI, true, true, true));
+                canonicalize(component, QUERY_COMPONENT_ENCODE_SET_URI, true, true, true, true));
           }
         }
       }
       if (encodedFragment != null) {
         encodedFragment = canonicalize(
-            encodedFragment, FRAGMENT_ENCODE_SET_URI, true, false, false);
+            encodedFragment, FRAGMENT_ENCODE_SET_URI, true, true, false, false);
       }
       return this;
     }
@@ -1013,19 +1027,19 @@
                 int passwordColonOffset = delimiterOffset(
                     input, pos, componentDelimiterOffset, ":");
                 String canonicalUsername = canonicalize(
-                    input, pos, passwordColonOffset, USERNAME_ENCODE_SET, true, false, true);
+                    input, pos, passwordColonOffset, USERNAME_ENCODE_SET, true, false, false, true);
                 this.encodedUsername = hasUsername
                     ? this.encodedUsername + "%40" + canonicalUsername
                     : canonicalUsername;
                 if (passwordColonOffset != componentDelimiterOffset) {
                   hasPassword = true;
                   this.encodedPassword = canonicalize(input, passwordColonOffset + 1,
-                      componentDelimiterOffset, PASSWORD_ENCODE_SET, true, false, true);
+                      componentDelimiterOffset, PASSWORD_ENCODE_SET, true, false, false, true);
                 }
                 hasUsername = true;
               } else {
-                this.encodedPassword = this.encodedPassword + "%40" + canonicalize(
-                    input, pos, componentDelimiterOffset, PASSWORD_ENCODE_SET, true, false, true);
+                this.encodedPassword = this.encodedPassword + "%40" + canonicalize(input, pos,
+                    componentDelimiterOffset, PASSWORD_ENCODE_SET, true, false, false, true);
               }
               pos = componentDelimiterOffset + 1;
               break;
@@ -1072,14 +1086,14 @@
       if (pos < limit && input.charAt(pos) == '?') {
         int queryDelimiterOffset = delimiterOffset(input, pos, limit, "#");
         this.encodedQueryNamesAndValues = queryStringToNamesAndValues(canonicalize(
-            input, pos + 1, queryDelimiterOffset, QUERY_ENCODE_SET, true, true, true));
+            input, pos + 1, queryDelimiterOffset, QUERY_ENCODE_SET, true, false, true, true));
         pos = queryDelimiterOffset;
       }
 
       // Fragment.
       if (pos < limit && input.charAt(pos) == '#') {
         this.encodedFragment = canonicalize(
-            input, pos + 1, limit, FRAGMENT_ENCODE_SET, true, false, false);
+            input, pos + 1, limit, FRAGMENT_ENCODE_SET, true, false, false, false);
       }
 
       return ParseResult.SUCCESS;
@@ -1116,7 +1130,7 @@
     private void push(String input, int pos, int limit, boolean addTrailingSlash,
         boolean alreadyEncoded) {
       String segment = canonicalize(
-          input, pos, limit, PATH_SEGMENT_ENCODE_SET, alreadyEncoded, false, true);
+          input, pos, limit, PATH_SEGMENT_ENCODE_SET, alreadyEncoded, false, false, true);
       if (isDot(segment)) {
         return; // Skip '.' path segments.
       }
@@ -1467,7 +1481,7 @@
     private static int parsePort(String input, int pos, int limit) {
       try {
         // Canonicalize the port string to skip '\n' etc.
-        String portString = canonicalize(input, pos, limit, "", false, false, true);
+        String portString = canonicalize(input, pos, limit, "", false, false, false, true);
         int i = Integer.parseInt(portString);
         if (i > 0 && i <= 65535) return i;
         return -1;
@@ -1536,6 +1550,13 @@
     }
   }
 
+  static boolean percentEncoded(String encoded, int pos, int limit) {
+    return pos + 2 < limit
+        && encoded.charAt(pos) == '%'
+        && decodeHexDigit(encoded.charAt(pos + 1)) != -1
+        && decodeHexDigit(encoded.charAt(pos + 2)) != -1;
+  }
+
   static int decodeHexDigit(char c) {
     if (c >= '0' && c <= '9') return c - '0';
     if (c >= 'a' && c <= 'f') return c - 'a' + 10;
@@ -1555,24 +1576,26 @@
    * </ul>
    *
    * @param alreadyEncoded true to leave '%' as-is; false to convert it to '%25'.
+   * @param strict true to encode '%' if it is not the prefix of a valid percent encoding.
    * @param plusIsSpace true to encode '+' as "%2B" if it is not already encoded
    * @param asciiOnly true to encode all non-ASCII codepoints.
    */
   static String canonicalize(String input, int pos, int limit, String encodeSet,
-      boolean alreadyEncoded, boolean plusIsSpace, boolean asciiOnly) {
+      boolean alreadyEncoded, boolean strict, boolean plusIsSpace, boolean asciiOnly) {
     int codePoint;
     for (int i = pos; i < limit; i += Character.charCount(codePoint)) {
       codePoint = input.codePointAt(i);
       if (codePoint < 0x20
           || codePoint == 0x7f
-          || (codePoint >= 0x80 && asciiOnly)
+          || codePoint >= 0x80 && asciiOnly
           || encodeSet.indexOf(codePoint) != -1
-          || (codePoint == '%' && !alreadyEncoded)
-          || (codePoint == '+' && plusIsSpace)) {
+          || codePoint == '%' && (!alreadyEncoded || strict && !percentEncoded(input, i, limit))
+          || codePoint == '+' && plusIsSpace) {
         // Slow path: the character at i requires encoding!
         Buffer out = new Buffer();
         out.writeUtf8(input, pos, i);
-        canonicalize(out, input, i, limit, encodeSet, alreadyEncoded, plusIsSpace, asciiOnly);
+        canonicalize(out, input, i, limit, encodeSet, alreadyEncoded, strict, plusIsSpace,
+            asciiOnly);
         return out.readUtf8();
       }
     }
@@ -1581,8 +1604,8 @@
     return input.substring(pos, limit);
   }
 
-  static void canonicalize(Buffer out, String input, int pos, int limit,
-      String encodeSet, boolean alreadyEncoded, boolean plusIsSpace, boolean asciiOnly) {
+  static void canonicalize(Buffer out, String input, int pos, int limit, String encodeSet,
+      boolean alreadyEncoded, boolean strict, boolean plusIsSpace, boolean asciiOnly) {
     Buffer utf8Buffer = null; // Lazily allocated.
     int codePoint;
     for (int i = pos; i < limit; i += Character.charCount(codePoint)) {
@@ -1595,9 +1618,9 @@
         out.writeUtf8(alreadyEncoded ? "+" : "%2B");
       } else if (codePoint < 0x20
           || codePoint == 0x7f
-          || (codePoint >= 0x80 && asciiOnly)
+          || codePoint >= 0x80 && asciiOnly
           || encodeSet.indexOf(codePoint) != -1
-          || (codePoint == '%' && !alreadyEncoded)) {
+          || codePoint == '%' && (!alreadyEncoded || strict && !percentEncoded(input, i, limit))) {
         // Percent encode this character.
         if (utf8Buffer == null) {
           utf8Buffer = new Buffer();
@@ -1616,8 +1639,9 @@
     }
   }
 
-  static String canonicalize(
-      String input, String encodeSet, boolean alreadyEncoded, boolean query, boolean asciiOnly) {
-    return canonicalize(input, 0, input.length(), encodeSet, alreadyEncoded, query, asciiOnly);
+  static String canonicalize(String input, String encodeSet, boolean alreadyEncoded, boolean strict,
+      boolean plusIsSpace, boolean asciiOnly) {
+    return canonicalize(
+        input, 0, input.length(), encodeSet, alreadyEncoded, strict, plusIsSpace, asciiOnly);
   }
 }