@@ -104,6 +104,7 @@ fmtIdEnc(const char *rawid, int encoding)
104104
105105 const char * cp ;
106106 bool need_quotes = false;
107+ size_t remaining = strlen (rawid );
107108
108109 /*
109110 * These checks need to match the identifier production in scan.l. Don't
@@ -117,7 +118,8 @@ fmtIdEnc(const char *rawid, int encoding)
117118 else
118119 {
119120 /* otherwise check the entire string */
120- for (cp = rawid ; * cp ; cp ++ )
121+ cp = rawid ;
122+ for (size_t i = 0 ; i < remaining ; i ++ , cp ++ )
121123 {
122124 if (!((* cp >= 'a' && * cp <= 'z' )
123125 || (* cp >= '0' && * cp <= '9' )
@@ -153,17 +155,90 @@ fmtIdEnc(const char *rawid, int encoding)
153155 else
154156 {
155157 appendPQExpBufferChar (id_return , '"' );
156- for (cp = rawid ; * cp ; cp ++ )
158+
159+ cp = & rawid [0 ];
160+ while (remaining > 0 )
157161 {
158- /*
159- * Did we find a double-quote in the string? Then make this a
160- * double double-quote per SQL99. Before, we put in a
161- * backslash/double-quote pair. - thomas 2000-08-05
162- */
163- if (* cp == '"' )
164- appendPQExpBufferChar (id_return , '"' );
165- appendPQExpBufferChar (id_return , * cp );
162+ int charlen ;
163+
164+ /* Fast path for plain ASCII */
165+ if (!IS_HIGHBIT_SET (* cp ))
166+ {
167+ /*
168+ * Did we find a double-quote in the string? Then make this a
169+ * double double-quote per SQL99. Before, we put in a
170+ * backslash/double-quote pair. - thomas 2000-08-05
171+ */
172+ if (* cp == '"' )
173+ appendPQExpBufferChar (id_return , '"' );
174+ appendPQExpBufferChar (id_return , * cp );
175+ remaining -- ;
176+ cp ++ ;
177+ continue ;
178+ }
179+
180+ /* Slow path for possible multibyte characters */
181+ charlen = pg_encoding_mblen (encoding , cp );
182+
183+ if (remaining < charlen )
184+ {
185+ /*
186+ * If the character is longer than the available input, emit
187+ * an invalid sequence in place of the truncated character.
188+ * The invalid sequence ensures that the escaped string will
189+ * trigger an error on the server-side, even if we can't
190+ * directly report an error here.
191+ */
192+ enlargePQExpBuffer (id_return , 2 );
193+ pg_encoding_set_invalid (encoding ,
194+ id_return -> data + id_return -> len );
195+ id_return -> len += 2 ;
196+ id_return -> data [id_return -> len ] = '\0' ;
197+
198+ /* there's no more input data, so we can stop */
199+ break ;
200+ }
201+ else if (pg_encoding_verifymbchar (encoding , cp , charlen ) == -1 )
202+ {
203+ /*
204+ * Multibyte character is invalid. It's important to verify
205+ * this, because invalid multi-byte characters could be used to
206+ * "skip" over quote characters, e.g. when parsing
207+ * character-by-character.
208+ *
209+ * Replace the bytes corresponding to the invalid character
210+ * with an invalid sequence, for the same reason as above.
211+ *
212+ * It would be a bit faster to verify the whole string the
213+ * first time we encounter a set highbit, but this way we can
214+ * replace just the invalid characters, which probably makes
215+ * it easier for users to find the invalidly encoded portion
216+ * of a larger string.
217+ */
218+ enlargePQExpBuffer (id_return , 2 );
219+ pg_encoding_set_invalid (encoding ,
220+ id_return -> data + id_return -> len );
221+ id_return -> len += 2 ;
222+ id_return -> data [id_return -> len ] = '\0' ;
223+
224+ /*
225+ * Copy the rest of the string after the invalid multi-byte
226+ * character.
227+ */
228+ remaining -= charlen ;
229+ cp += charlen ;
230+ }
231+ else
232+ {
233+ for (int i = 0 ; i < charlen ; i ++ )
234+ {
235+ appendPQExpBufferChar (id_return , * cp );
236+ remaining -- ;
237+ cp ++ ;
238+ }
239+ }
166240 }
241+
167242 appendPQExpBufferChar (id_return , '"' );
168243 }
169244
@@ -290,17 +365,18 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
290365 size_t length = strlen (str );
291366 const char * source = str ;
292367 char * target ;
368+ size_t remaining = length ;
293369
294370 if (!enlargePQExpBuffer (buf , 2 * length + 2 ))
295371 return ;
296372
297373 target = buf -> data + buf -> len ;
298374 * target ++ = '\'' ;
299375
300- while (* source != '\0' )
376+ while (remaining > 0 )
301377 {
302378 char c = * source ;
303- int len ;
379+ int charlen ;
304380 int i ;
305381
306382 /* Fast path for plain ASCII */
@@ -312,39 +388,65 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
312388 /* Copy the character */
313389 * target ++ = c ;
314390 source ++ ;
391+ remaining -- ;
315392 continue ;
316393 }
317394
318395 /* Slow path for possible multibyte characters */
319- len = PQmblen (source , encoding );
396+ charlen = PQmblen (source , encoding );
320397
321- /* Copy the character */
322- for (i = 0 ; i < len ; i ++ )
398+ if (remaining < charlen )
323399 {
324- if (* source == '\0' )
325- break ;
326- * target ++ = * source ++ ;
327- }
400+ /*
401+ * If the character is longer than the available input, emit an
402+ * invalid sequence in place of the truncated character. The invalid
403+ * sequence ensures that the escaped string will trigger an error on
404+ * the server-side, even if we can't directly report an error here.
405+ *
406+ * We know there's enough space for the invalid sequence because
407+ * the "target" buffer is 2 * length + 2 long, and at worst we're
408+ * replacing a single input byte with two invalid bytes.
409+ */
410+ pg_encoding_set_invalid (encoding , target );
411+ target += 2 ;
328412
329- /*
330- * If we hit premature end of string (ie, incomplete multibyte
331- * character), try to pad out to the correct length with spaces. We
332- * may not be able to pad completely, but we will always be able to
333- * insert at least one pad space (since we'd not have quoted a
334- * multibyte character). This should be enough to make a string that
335- * the server will error out on.
336- */
337- if (i < len )
413+ /* there's no more valid input data, so we can stop */
414+ break ;
415+ }
416+ else if (pg_encoding_verifymbchar (encoding , source , charlen ) == -1 )
338417 {
339- char * stop = buf -> data + buf -> maxlen - 2 ;
418+ /*
419+ * Multibyte character is invalid. It's important to verify this,
420+ * because invalid multi-byte characters could be used to "skip"
421+ * over quote characters, e.g. when parsing
422+ * character-by-character.
423+ *
424+ * Replace the bytes corresponding to the invalid character with
425+ * an invalid sequence, for the same reason as above.
426+ *
427+ * It would be a bit faster to verify the whole string the first
428+ * time we encounter a set highbit, but this way we can replace
429+ * just the invalid characters, which probably makes it easier for
430+ * users to find the invalidly encoded portion of a larger string.
431+ */
432+ pg_encoding_set_invalid (encoding , target );
433+ target += 2 ;
434+ remaining -= charlen ;
340435
341- for (; i < len ; i ++ )
436+ /*
437+ * Copy the rest of the string after the invalid multi-byte
438+ * character.
439+ */
440+ source += charlen ;
441+ }
442+ else
443+ {
444+ /* Copy the character */
445+ for (i = 0 ; i < charlen ; i ++ )
342446 {
343- if (target >= stop )
344- break ;
345- * target ++ = ' ' ;
447+ * target ++ = * source ++ ;
448+ remaining -- ;
346449 }
347- break ;
348450 }
349451 }
350452
0 commit comments