@@ -81,24 +81,6 @@ typedef std::unique_ptr<StreamWriter> StreamWriterPtr;
81
81
typedef std::auto_ptr<StreamWriter> StreamWriterPtr;
82
82
#endif
83
83
84
- static bool containsControlCharacter (const char * str) {
85
- while (*str) {
86
- if (isControlCharacter (*(str++)))
87
- return true ;
88
- }
89
- return false ;
90
- }
91
-
92
- static bool containsControlCharacter0 (const char * str, unsigned len) {
93
- char const * end = str + len;
94
- while (end != str) {
95
- if (isControlCharacter (*str) || 0 ==*str)
96
- return true ;
97
- ++str;
98
- }
99
- return false ;
100
- }
101
-
102
84
JSONCPP_STRING valueToString (LargestInt value) {
103
85
UIntToStringBuffer buffer;
104
86
char * current = buffer + sizeof (buffer);
@@ -176,89 +158,103 @@ JSONCPP_STRING valueToString(double value) { return valueToString(value, false,
176
158
177
159
// Serializes a boolean as the JSON literal `true` or `false`.
// The literals must not carry surrounding whitespace: the returned text is
// the token itself.
JSONCPP_STRING valueToString(bool value) { return value ? "true" : "false"; }
178
160
179
- JSONCPP_STRING valueToQuotedString (const char * value) {
180
- if (value == NULL )
181
- return " " ;
182
- // Not sure how to handle unicode...
183
- if (strpbrk (value, " \"\\\b\f\n\r\t " ) == NULL &&
184
- !containsControlCharacter (value))
185
- return JSONCPP_STRING (" \" " ) + value + " \" " ;
186
- // We have to walk value and escape any special characters.
187
- // Appending to JSONCPP_STRING is not efficient, but this should be rare.
188
- // (Note: forward slashes are *not* rare, but I am not escaping them.)
189
- JSONCPP_STRING::size_type maxsize =
190
- strlen (value) * 2 + 3 ; // allescaped+quotes+NULL
191
- JSONCPP_STRING result;
192
- result.reserve (maxsize); // to avoid lots of mallocs
193
- result += " \" " ;
194
- for (const char * c = value; *c != 0 ; ++c) {
195
- switch (*c) {
196
- case ' \" ' :
197
- result += " \\\" " ;
198
- break ;
199
- case ' \\ ' :
200
- result += " \\\\ " ;
201
- break ;
202
- case ' \b ' :
203
- result += " \\ b" ;
204
- break ;
205
- case ' \f ' :
206
- result += " \\ f" ;
207
- break ;
208
- case ' \n ' :
209
- result += " \\ n" ;
210
- break ;
211
- case ' \r ' :
212
- result += " \\ r" ;
213
- break ;
214
- case ' \t ' :
215
- result += " \\ t" ;
216
- break ;
217
- // case '/':
218
- // Even though \/ is considered a legal escape in JSON, a bare
219
- // slash is also legal, so I see no reason to escape it.
220
- // (I hope I am not misunderstanding something.
221
- // blep notes: actually escaping \/ may be useful in javascript to avoid </
222
- // sequence.
223
- // Should add a flag to allow this compatibility mode and prevent this
224
- // sequence from occurring.
225
- default :
226
- if (isControlCharacter (*c)) {
227
- JSONCPP_OSTRINGSTREAM oss;
228
- oss << " \\ u" << std::hex << std::uppercase << std::setfill (' 0' )
229
- << std::setw (4 ) << static_cast <int >(*c);
230
- result += oss.str ();
231
- } else {
232
- result += *c;
233
- }
234
- break ;
235
- }
161
// Reports whether any of the first `n` bytes of `s` forces the slow,
// byte-by-byte escaping path. That is the case for:
//   - a backslash or a double quote,
//   - a control character (below 0x20, which includes an embedded NUL),
//   - a non-ASCII byte (0x80 and above, i.e. part of a multi-byte sequence).
// `s` may be NULL only when `n` is 0. Returns false for an empty range.
static bool isAnyCharRequiredQuoting(char const* s, size_t n) {
  assert(s || !n);

  char const* const end = s + n;
  for (char const* cur = s; cur < end; ++cur) {
    // Compare as unsigned so the outcome does not depend on whether plain
    // `char` is signed on the target platform.
    unsigned char const c = static_cast<unsigned char>(*cur);
    if (c == '\\' || c == '"' || c < 0x20 || c >= 0x80)
      return true;
  }
  return false;
}
240
172
241
- // https://github.com/upcaste/upcaste/blob/master/src/upcore/src/cstring/strnpbrk.cpp
242
- static char const * strnpbrk (char const * s, char const * accept, size_t n) {
243
- assert ((s || !n) && accept);
173
// Returns true when `byte` has the 10xxxxxx form of a UTF-8 continuation byte.
static bool isUtf8ContinuationByte(char byte) {
  return (static_cast<unsigned char>(byte) & 0xC0) == 0x80;
}

// Extracts the six payload bits of a UTF-8 continuation byte.
static unsigned int utf8ContinuationBits(char byte) {
  return static_cast<unsigned int>(static_cast<unsigned char>(byte)) & 0x3F;
}

// Decodes the UTF-8 sequence that starts at `s` and ends before `e`.
//
// Precondition: s < e (the first byte is read unconditionally).
//
// `s` is advanced to the LAST byte consumed, not one past it, so a caller
// that increments `s` itself afterwards lands on the next sequence.
//
// Returns the decoded code point, or U+FFFD (REPLACEMENT CHARACTER) when:
//   - the lead byte is invalid, the sequence is truncated, or a continuation
//     byte is malformed: `s` is left untouched, so only the lead byte counts
//     as consumed and the following bytes are decoded on their own;
//   - the sequence is structurally complete but encodes an overlong form, a
//     UTF-16 surrogate (U+D800..U+DFFF) or a value above U+10FFFF: the whole
//     sequence counts as consumed.
static unsigned int utf8ToCodepoint(const char*& s, const char* e) {
  const unsigned int REPLACEMENT_CHARACTER = 0xFFFD;

  const unsigned int firstByte = static_cast<unsigned char>(*s);

  // Single-byte (ASCII) sequence.
  if (firstByte < 0x80)
    return firstByte;

  // 0x80..0xBF are continuation bytes and can never start a sequence.
  if (firstByte < 0xC0)
    return REPLACEMENT_CHARACTER;

  // Two-byte sequence: 110xxxxx 10xxxxxx.
  if (firstByte < 0xE0) {
    if (e - s < 2 || !isUtf8ContinuationByte(s[1]))
      return REPLACEMENT_CHARACTER;

    const unsigned int calculated =
        ((firstByte & 0x1F) << 6) | utf8ContinuationBits(s[1]);
    s += 1;
    // oversized encoded characters are invalid
    return calculated < 0x80 ? REPLACEMENT_CHARACTER : calculated;
  }

  // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
  if (firstByte < 0xF0) {
    if (e - s < 3 || !isUtf8ContinuationByte(s[1]) ||
        !isUtf8ContinuationByte(s[2]))
      return REPLACEMENT_CHARACTER;

    const unsigned int calculated = ((firstByte & 0x0F) << 12) |
                                    (utf8ContinuationBits(s[1]) << 6) |
                                    utf8ContinuationBits(s[2]);
    s += 2;
    // Surrogates are not valid code points on their own and must not be
    // UTF-8 encoded.
    if (calculated >= 0xD800 && calculated <= 0xDFFF)
      return REPLACEMENT_CHARACTER;
    // oversized encoded characters are invalid
    return calculated < 0x800 ? REPLACEMENT_CHARACTER : calculated;
  }

  // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
  if (firstByte < 0xF8) {
    if (e - s < 4 || !isUtf8ContinuationByte(s[1]) ||
        !isUtf8ContinuationByte(s[2]) || !isUtf8ContinuationByte(s[3]))
      return REPLACEMENT_CHARACTER;

    // The three lead-byte payload bits sit above 3 * 6 = 18 continuation bits.
    const unsigned int calculated = ((firstByte & 0x07) << 18) |
                                    (utf8ContinuationBits(s[1]) << 12) |
                                    (utf8ContinuationBits(s[2]) << 6) |
                                    utf8ContinuationBits(s[3]);
    s += 3;
    // Unicode ends at U+10FFFF; anything above cannot be represented as a
    // surrogate pair either.
    if (calculated > 0x10FFFF)
      return REPLACEMENT_CHARACTER;
    // oversized encoded characters are invalid
    return calculated < 0x10000 ? REPLACEMENT_CHARACTER : calculated;
  }

  // 0xF8..0xFF never appear in UTF-8.
  return REPLACEMENT_CHARACTER;
}
223
+
224
// Lookup table holding the two lowercase hexadecimal digits of every byte
// value: the digits of byte `i` are hex2[2 * i] and hex2[2 * i + 1].
// The segments must be contiguous (no padding characters), otherwise the
// `2 * i` indexing is thrown off.
static const char hex2[] =
    "000102030405060708090a0b0c0d0e0f"
    "101112131415161718191a1b1c1d1e1f"
    "202122232425262728292a2b2c2d2e2f"
    "303132333435363738393a3b3c3d3e3f"
    "404142434445464748494a4b4c4d4e4f"
    "505152535455565758595a5b5c5d5e5f"
    "606162636465666768696a6b6c6d6e6f"
    "707172737475767778797a7b7c7d7e7f"
    "808182838485868788898a8b8c8d8e8f"
    "909192939495969798999a9b9c9d9e9f"
    "a0a1a2a3a4a5a6a7a8a9aaabacadaeaf"
    "b0b1b2b3b4b5b6b7b8b9babbbcbdbebf"
    "c0c1c2c3c4c5c6c7c8c9cacbcccdcecf"
    "d0d1d2d3d4d5d6d7d8d9dadbdcdddedf"
    "e0e1e2e3e4e5e6e7e8e9eaebecedeeef"
    "f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff";
241
+
242
+ static JSONCPP_STRING toHex16Bit (unsigned int x) {
243
+ const unsigned int hi = (x >> 8 ) & 0xff ;
244
+ const unsigned int lo = x & 0xff ;
245
+ JSONCPP_STRING result (4 , ' ' );
246
+ result[0 ] = hex2[2 * hi];
247
+ result[1 ] = hex2[2 * hi + 1 ];
248
+ result[2 ] = hex2[2 * lo];
249
+ result[3 ] = hex2[2 * lo + 1 ];
250
+ return result;
251
+ }
252
+
256
253
static JSONCPP_STRING valueToQuotedStringN (const char * value, unsigned length) {
257
254
if (value == NULL )
258
255
return " " ;
259
- // Not sure how to handle unicode...
260
- if (strnpbrk (value, " \"\\\b\f\n\r\t " , length) == NULL &&
261
- !containsControlCharacter0 (value, length))
256
+
257
+ if (!isAnyCharRequiredQuoting (value, length))
262
258
return JSONCPP_STRING (" \" " ) + value + " \" " ;
263
259
// We have to walk value and escape any special characters.
264
260
// Appending to JSONCPP_STRING is not efficient, but this should be rare.
@@ -300,14 +296,24 @@ static JSONCPP_STRING valueToQuotedStringN(const char* value, unsigned length) {
300
296
// sequence.
301
297
// Should add a flag to allow this compatibility mode and prevent this
302
298
// sequence from occurring.
303
- default :
304
- if ((isControlCharacter (*c)) || (*c == 0 )) {
305
- JSONCPP_OSTRINGSTREAM oss;
306
- oss << " \\ u" << std::hex << std::uppercase << std::setfill (' 0' )
307
- << std::setw (4 ) << static_cast <int >(*c);
308
- result += oss.str ();
309
- } else {
310
- result += *c;
299
+ default : {
300
+ unsigned int cp = utf8ToCodepoint (c, end);
301
+ // don't escape non-control characters
302
+ // (short escape sequence are applied above)
303
+ if (cp < 0x80 && cp >= 0x20 )
304
+ result += static_cast <char >(cp);
305
+ else if (cp < 0x10000 ) { // codepoint is in Basic Multilingual Plane
306
+ result += " \\ u" ;
307
+ result += toHex16Bit (cp);
308
+ }
309
+ else { // codepoint is not in Basic Multilingual Plane
310
+ // convert to surrogate pair first
311
+ cp -= 0x10000 ;
312
+ result += " \\ u" ;
313
+ result += toHex16Bit ((cp >> 10 ) + 0xD800 );
314
+ result += " \\ u" ;
315
+ result += toHex16Bit ((cp & 0x3FF ) + 0xDC00 );
316
+ }
311
317
}
312
318
break ;
313
319
}
@@ -316,6 +322,10 @@ static JSONCPP_STRING valueToQuotedStringN(const char* value, unsigned length) {
316
322
return result;
317
323
}
318
324
325
+ JSONCPP_STRING valueToQuotedString (const char * value) {
326
+ return valueToQuotedStringN (value, static_cast <unsigned int >(strlen (value)));
327
+ }
328
+
319
329
// Class Writer
320
330
// //////////////////////////////////////////////////////////////////
321
331
// Out-of-line definition of the Writer destructor; the body is empty.
Writer::~Writer () {}
0 commit comments