Build on aferreira's improvements to handle empty parameters that have no `=`, and improve the component decoding logic

kimchy · kimchy · commit fb46f6b0a506 · 2011-01-07T14:46:14.000+02:00
diff --git a/.idea/dictionaries/kimchy.xml b/.idea/dictionaries/kimchy.xml
diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/rest/support/RestUtils.java b/modules/elasticsearch/src/main/java/org/elasticsearch/rest/support/RestUtils.java
@@ -19,53 +19,181 @@
 
 package org.elasticsearch.rest.support;
 
-import java.io.UnsupportedEncodingException;
-import java.net.URLDecoder;
-import java.nio.charset.UnsupportedCharsetException;
+import org.elasticsearch.common.base.Charsets;
+
+import java.nio.charset.Charset;
 import java.util.Map;
 
 /**
  * @author kimchy (shay.banon)
  */
 public class RestUtils {
 
-    public static void decodeQueryString(String queryString, int fromIndex, Map<String, String> params) {
+    public static void decodeQueryString(String s, int fromIndex, Map<String, String> params) {
         if (fromIndex < 0) {
             return;
         }
-        if (fromIndex >= queryString.length()) {
+        if (fromIndex >= s.length()) {
             return;
         }
-        int toIndex;
-        while ((toIndex = queryString.indexOf('&', fromIndex)) >= 0) {
-            int idx = queryString.indexOf('=', fromIndex);
-            if (fromIndex < idx && idx < toIndex) {
-                params.put(decodeComponent(queryString.substring(fromIndex, idx)), decodeComponent(queryString.substring(idx + 1, toIndex)));
+
+        String name = null;
+        int pos = fromIndex; // Beginning of the unprocessed region
+        int i;       // End of the unprocessed region
+        char c = 0;  // Current character
+        for (i = fromIndex; i < s.length(); i++) {
+            c = s.charAt(i);
+            if (c == '=' && name == null) {
+                if (pos != i) {
+                    name = decodeComponent(s.substring(pos, i));
+                }
+                pos = i + 1;
+            } else if (c == '&') {
+                if (name == null && pos != i) {
+                    // We haven't seen an `=' so far but moved forward.
+                    // Must be a param of the form '&a&' so add it with
+                    // an empty value.
+                    addParam(params, decodeComponent(s.substring(pos, i)), "");
+                } else if (name != null) {
+                    addParam(params, name, decodeComponent(s.substring(pos, i)));
+                    name = null;
+                }
+                pos = i + 1;
             }
-            fromIndex = toIndex + 1;
         }
-        int idx = queryString.indexOf('=', fromIndex);
-        if (idx < 0) {
-            return;
+
+        if (pos != i) {  // Are there characters we haven't dealt with?
+            if (name == null) {     // Yes and we haven't seen any `='.
+                addParam(params, decodeComponent(s.substring(pos, i)), "");
+            } else {                // Yes and this must be the last value.
+                addParam(params, name, decodeComponent(s.substring(pos, i)));
+            }
+        } else if (name != null) {  // Have we seen a name without value?
+            addParam(params, name, "");
         }
-        params.put(decodeComponent(queryString.substring(fromIndex, idx)), decodeComponent(queryString.substring(idx + 1)));
     }
 
-    public static String decodeComponent(String s) {
+    private static void addParam(Map<String, String> params, String name, String value) {
+        params.put(name, value);
+    }
+
+    /**
+     * Decodes part of a URL encoded by a browser.
+     * <p>
+     * This is equivalent to calling {@link #decodeComponent(String, Charset)}
+     * with the UTF-8 charset (recommended to comply with RFC 3986, Section 2).
+     *
+     * @param s The string to decode (can be empty).
+     * @return The decoded string, or {@code s} if there's nothing to decode.
+     *         If the string to decode is {@code null}, returns an empty string.
+     * @throws IllegalArgumentException if the string contains a malformed
+     *                                  escape sequence.
+     */
+    public static String decodeComponent(final String s) {
+        return decodeComponent(s, Charsets.UTF_8);
+    }
+
+    /**
+     * Decodes part of a URL encoded by a browser.
+     * <p>
+     * The string is expected to be encoded as per RFC 3986, Section 2.
+     * This is the encoding used by JavaScript functions {@code encodeURI}
+     * and {@code encodeURIComponent}, but not {@code escape}.  For example
+     * in this encoding, &eacute; (in Unicode {@code U+00E9} or in UTF-8
+     * {@code 0xC3 0xA9}) is encoded as {@code %C3%A9} or {@code %c3%a9}.
+     * <p>
+     * This is essentially equivalent to calling
+     * <code>{@link java.net.URLDecoder URLDecoder}.{@link
+     * java.net.URLDecoder#decode(String, String)}</code>
+     * except that it's over 2x faster and generates less garbage for the GC.
+     * In fact, this function doesn't allocate any memory if there's nothing
+     * to decode: the argument itself is returned.
+     *
+     * @param s       The string to decode (can be empty).
+     * @param charset The charset to use to decode the string (should really
+     *                be {@link Charsets#UTF_8}).
+     * @return The decoded string, or {@code s} if there's nothing to decode.
+     *         If the string to decode is {@code null}, returns an empty string.
+     * @throws IllegalArgumentException if the string contains a malformed
+     *                                  escape sequence.
+     */
+    @SuppressWarnings("fallthrough")
+    public static String decodeComponent(final String s, final Charset charset) {
         if (s == null) {
             return "";
         }
-        int numChars = s.length();
-        for (int i = 0; i < numChars; i++) {
-            // do an initial check if it requires decoding do it and return
-            if (s.charAt(i) == '+' || s.charAt(i) == '%') {
-                try {
-                    return URLDecoder.decode(s, "UTF8");
-                } catch (UnsupportedEncodingException e) {
-                    throw new UnsupportedCharsetException("UTF8");
-                }
+        final int size = s.length();
+        boolean modified = false;
+        for (int i = 0; i < size; i++) {
+            final char c = s.charAt(i);
+            switch (c) {
+                case '%':
+                    i++;  // We can skip at least one char, e.g. `%%'.
+                    // Fall through.
+                case '+':
+                    modified = true;
+                    break;
             }
         }
-        return s;
+        if (!modified) {
+            return s;
+        }
+        final byte[] buf = new byte[size];
+        int pos = 0;  // position in `buf'.
+        for (int i = 0; i < size; i++) {
+            char c = s.charAt(i);
+            switch (c) {
+                case '+':
+                    buf[pos++] = ' ';  // "+" -> " "
+                    break;
+                case '%':
+                    if (i == size - 1) {
+                        throw new IllegalArgumentException("unterminated escape"
+                                + " sequence at end of string: " + s);
+                    }
+                    c = s.charAt(++i);
+                    if (c == '%') {
+                        buf[pos++] = '%';  // "%%" -> "%"
+                        break;
+                    } else if (i == size - 1) {
+                        throw new IllegalArgumentException("partial escape"
+                                + " sequence at end of string: " + s);
+                    }
+                    c = decodeHexNibble(c);
+                    final char c2 = decodeHexNibble(s.charAt(++i));
+                    if (c == Character.MAX_VALUE || c2 == Character.MAX_VALUE) {
+                        throw new IllegalArgumentException(
+                                "invalid escape sequence `%" + s.charAt(i - 1)
+                                        + s.charAt(i) + "' at index " + (i - 2)
+                                        + " of: " + s);
+                    }
+                    c = (char) (c * 16 + c2);
+                    // Fall through.
+                default:
+                    buf[pos++] = (byte) c;
+                    break;
+            }
+        }
+        return new String(buf, 0, pos, charset);
+    }
+
+    /**
+     * Helper to decode half of a hexadecimal number from a string.
+     *
+     * @param c The ASCII character of the hexadecimal number to decode.
+     *          Must be in the range {@code [0-9a-fA-F]}.
+     * @return The hexadecimal value represented in the ASCII character
+     *         given, or {@link Character#MAX_VALUE} if the character is invalid.
+     */
+    private static char decodeHexNibble(final char c) {
+        if ('0' <= c && c <= '9') {
+            return (char) (c - '0');
+        } else if ('a' <= c && c <= 'f') {
+            return (char) (c - 'a' + 10);
+        } else if ('A' <= c && c <= 'F') {
+            return (char) (c - 'A' + 10);
+        } else {
+            return Character.MAX_VALUE;
+        }
     }
 }
diff --git a/modules/elasticsearch/src/test/java/org/elasticsearch/rest/util/RestUtilsTests.java b/modules/elasticsearch/src/test/java/org/elasticsearch/rest/util/RestUtilsTests.java
@@ -84,37 +84,40 @@ public void testDecodeQueryStringEdgeCases() {
         params.clear();
         uri = "something?=";
         RestUtils.decodeQueryString(uri, uri.indexOf('?') + 1, params);
-        assertThat(params.size(), equalTo(1));
-        assertThat(params.get(""), equalTo(""));
+        assertThat(params.size(), equalTo(0));
 
         params.clear();
         uri = "something?&=";
         RestUtils.decodeQueryString(uri, uri.indexOf('?') + 1, params);
-        assertThat(params.size(), equalTo(1));
-        assertThat(params.get(""), equalTo(""));
+        assertThat(params.size(), equalTo(0));
 
         params.clear();
         uri = "something?a";
         RestUtils.decodeQueryString(uri, uri.indexOf('?') + 1, params);
-        assertThat(params.size(), equalTo(0));
+        assertThat(params.size(), equalTo(1));
+        assertThat(params.get("a"), equalTo(""));
 
         params.clear();
         uri = "something?p=v&a";
         RestUtils.decodeQueryString(uri, uri.indexOf('?') + 1, params);
-        assertThat(params.size(), equalTo(1));
+        assertThat(params.size(), equalTo(2));
+        assertThat(params.get("a"), equalTo(""));
         assertThat(params.get("p"), equalTo("v"));
 
         params.clear();
         uri = "something?p=v&a&p1=v1";
         RestUtils.decodeQueryString(uri, uri.indexOf('?') + 1, params);
-        assertThat(params.size(), equalTo(2));
+        assertThat(params.size(), equalTo(3));
+        assertThat(params.get("a"), equalTo(""));
         assertThat(params.get("p"), equalTo("v"));
         assertThat(params.get("p1"), equalTo("v1"));
 
         params.clear();
         uri = "something?p=v&a&b&p1=v1";
         RestUtils.decodeQueryString(uri, uri.indexOf('?') + 1, params);
-        assertThat(params.size(), equalTo(2));
+        assertThat(params.size(), equalTo(4));
+        assertThat(params.get("a"), equalTo(""));
+        assertThat(params.get("b"), equalTo(""));
         assertThat(params.get("p"), equalTo("v"));
         assertThat(params.get("p1"), equalTo("v1"));
     }