@@ -162,6 +162,76 @@ void BackUpToPreviousSlash(int path_begin_in_output,
162
162
output->set_length (i + 1 );
163
163
}
164
164
165
+ // Looks for problematic nested escape sequences and escapes the output as
166
+ // needed to ensure they can't be misinterpreted.
167
+ //
168
+ // Our concern is that in input escape sequence that's invalid because it
169
+ // contains nested escape sequences might look valid once those are unescaped.
170
+ // For example, "%%300" is not a valid escape sequence, but after unescaping the
171
+ // inner "%30" this becomes "%00" which is valid. Leaving this in the output
172
+ // string can result in callers re-canonicalizing the string and unescaping this
173
+ // sequence, thus resulting in something fundamentally different than the
174
+ // original input here. This can cause a variety of problems.
175
+ //
176
+ // This function is called after we've just unescaped a sequence that's within
177
+ // two output characters of a previous '%' that we know didn't begin a valid
178
+ // escape sequence in the input string. We look for whether the output is going
179
+ // to turn into a valid escape sequence, and if so, convert the initial '%' into
180
+ // an escaped "%25" so the output can't be misinterpreted.
181
+ //
182
+ // |spec| is the input string we're canonicalizing.
183
+ // |next_input_index| is the index of the next unprocessed character in |spec|.
184
+ // |input_len| is the length of |spec|.
185
+ // |last_invalid_percent_index| is the index in |output| of a previously-seen
186
+ // '%' character. The caller knows this '%' character isn't followed by a valid
187
+ // escape sequence in the input string.
188
+ // |output| is the canonicalized output thus far. The caller guarantees this
189
+ // ends with a '%' followed by one or two characters, and the '%' is the one
190
+ // pointed to by |last_invalid_percent_index|. The last character in the string
191
+ // was just unescaped.
192
+ template <typename CHAR>
193
+ void CheckForNestedEscapes (const CHAR* spec,
194
+ int next_input_index,
195
+ int input_len,
196
+ int last_invalid_percent_index,
197
+ CanonOutput* output) {
198
+ const int length = output->length ();
199
+ const char last_unescaped_char = output->at (length - 1 );
200
+
201
+ // If |output| currently looks like "%c", we need to try appending the next
202
+ // input character to see if this will result in a problematic escape
203
+ // sequence. Note that this won't trigger on the first nested escape of a
204
+ // two-escape sequence like "%%30%30" -- we'll allow the conversion to
205
+ // "%0%30" -- but the second nested escape will be caught by this function
206
+ // when it's called again in that case.
207
+ const bool append_next_char = last_invalid_percent_index == length - 2 ;
208
+ if (append_next_char) {
209
+ // If the input doesn't contain a 7-bit character next, this case won't be a
210
+ // problem.
211
+ if ((next_input_index == input_len) || (spec[next_input_index] >= 0x80 ))
212
+ return ;
213
+ output->push_back (static_cast <char >(spec[next_input_index]));
214
+ }
215
+
216
+ // Now output ends like "%cc". Try to unescape this.
217
+ int begin = last_invalid_percent_index;
218
+ unsigned char temp;
219
+ if (DecodeEscaped (output->data (), &begin, output->length (), &temp)) {
220
+ // New escape sequence found. Overwrite the characters following the '%'
221
+ // with "25", and push_back() the one or two characters that were following
222
+ // the '%' when we were called.
223
+ if (!append_next_char)
224
+ output->push_back (output->at (last_invalid_percent_index + 1 ));
225
+ output->set (last_invalid_percent_index + 1 , ' 2' );
226
+ output->set (last_invalid_percent_index + 2 , ' 5' );
227
+ output->push_back (last_unescaped_char);
228
+ } else if (append_next_char) {
229
+ // Not a valid escape sequence, but we still need to undo appending the next
230
+ // source character so the caller can process it normally.
231
+ output->set_length (length);
232
+ }
233
+ }
234
+
165
235
// Appends the given path to the output. It assumes that if the input path
166
236
// starts with a slash, it should be copied to the output. If no path has
167
237
// already been appended to the output (the case when not resolving
@@ -182,10 +252,15 @@ bool DoPartialPath(const CHAR* spec,
182
252
CanonOutput* output) {
183
253
int end = path.end ();
184
254
255
+ // We use this variable to minimize the amount of work done when unescaping --
256
+ // we'll only call CheckForNestedEscapes() when this points at one of the last
257
+ // couple of characters in |output|.
258
+ int last_invalid_percent_index = INT_MIN;
259
+
185
260
bool success = true ;
186
261
for (int i = path.begin ; i < end; i++) {
187
262
UCHAR uch = static_cast <UCHAR>(spec[i]);
188
- if (sizeof (CHAR) > sizeof ( char ) && uch >= 0x80 ) {
263
+ if (sizeof (CHAR) > 1 && uch >= 0x80 ) {
189
264
// We only need to test wide input for having non-ASCII characters. For
190
265
// narrow input, we'll always just use the lookup table. We don't try to
191
266
// do anything tricky with decoding/validating UTF-8. This function will
@@ -245,33 +320,40 @@ bool DoPartialPath(const CHAR* spec,
245
320
unsigned char unescaped_value;
246
321
if (DecodeEscaped (spec, &i, end, &unescaped_value)) {
247
322
// Valid escape sequence, see if we keep, reject, or unescape it.
323
+ // Note that at this point DecodeEscape() will have advanced |i| to
324
+ // the last character of the escape sequence.
248
325
char unescaped_flags = kPathCharLookup [unescaped_value];
249
326
250
327
if (unescaped_flags & UNESCAPE) {
251
- // This escaped value shouldn't be escaped, copy it.
328
+ // This escaped value shouldn't be escaped. Try to copy it.
252
329
output->push_back (unescaped_value);
253
- } else if (unescaped_flags & INVALID_BIT) {
254
- // Invalid escaped character, copy it and remember the error.
255
- output->push_back (' %' );
256
- output->push_back (static_cast <char >(spec[i - 1 ]));
257
- output->push_back (static_cast <char >(spec[i]));
258
- success = false ;
330
+ // If we just unescaped a value within 2 output characters of the
331
+ // '%' from a previously-detected invalid escape sequence, we
332
+ // might have an input string with problematic nested escape
333
+ // sequences; detect and fix them.
334
+ if (last_invalid_percent_index >= (output->length () - 3 )) {
335
+ CheckForNestedEscapes (spec, i + 1 , end,
336
+ last_invalid_percent_index, output);
337
+ }
259
338
} else {
260
- // Valid escaped character but we should keep it escaped. We
261
- // don't want to change the case of any hex letters in case
262
- // the server is sensitive to that, so we just copy the two
263
- // characters without checking (DecodeEscape will have advanced
264
- // to the last character of the pair) .
339
+ // Either this is an invalid escaped character, or it's a valid
340
+ // escaped character we should keep escaped. In the first case we
341
+ // should just copy it exactly and remember the error. In the
342
+ // second we also copy exactly in case the server is sensitive to
343
+ // changing the case of any hex letters .
265
344
output->push_back (' %' );
266
345
output->push_back (static_cast <char >(spec[i - 1 ]));
267
346
output->push_back (static_cast <char >(spec[i]));
347
+ if (unescaped_flags & INVALID_BIT)
348
+ success = false ;
268
349
}
269
350
} else {
270
- // Invalid escape sequence. IE7 rejects any URLs with such
271
- // sequences, while Firefox, IE6, and Safari all pass it through
272
- // unchanged. We are more permissive unlike IE7. I don't think this
273
- // can cause significant problems, if it does, we should change
274
- // to be more like IE7.
351
+ // Invalid escape sequence. IE7+ rejects any URLs with such
352
+ // sequences, while other browsers pass them through unchanged. We
353
+ // use the permissive behavior.
354
+ // TODO(brettw): Consider testing IE's strict behavior, which would
355
+ // allow removing the code to handle nested escapes above.
356
+ last_invalid_percent_index = output->length ();
275
357
output->push_back (' %' );
276
358
}
277
359
0 commit comments