Skip to content

Commit 42a161f

Browse files
pkierski authored and cdunn2001 committed
Serialize UTF-8 string with Unicode escapes (open-source-parsers#687)
Squashed and merged.
1 parent a3a4059 commit 42a161f

File tree

1 file changed

+110
-100
lines changed

1 file changed

+110
-100
lines changed

src/lib_json/json_writer.cpp

Lines changed: 110 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -81,24 +81,6 @@ typedef std::unique_ptr<StreamWriter> StreamWriterPtr;
8181
typedef std::auto_ptr<StreamWriter> StreamWriterPtr;
8282
#endif
8383

84-
static bool containsControlCharacter(const char* str) {
85-
while (*str) {
86-
if (isControlCharacter(*(str++)))
87-
return true;
88-
}
89-
return false;
90-
}
91-
92-
static bool containsControlCharacter0(const char* str, unsigned len) {
93-
char const* end = str + len;
94-
while (end != str) {
95-
if (isControlCharacter(*str) || 0==*str)
96-
return true;
97-
++str;
98-
}
99-
return false;
100-
}
101-
10284
JSONCPP_STRING valueToString(LargestInt value) {
10385
UIntToStringBuffer buffer;
10486
char* current = buffer + sizeof(buffer);
@@ -176,89 +158,103 @@ JSONCPP_STRING valueToString(double value) { return valueToString(value, false,
176158

177159
JSONCPP_STRING valueToString(bool value) { return value ? "true" : "false"; }
178160

179-
JSONCPP_STRING valueToQuotedString(const char* value) {
180-
if (value == NULL)
181-
return "";
182-
// Not sure how to handle unicode...
183-
if (strpbrk(value, "\"\\\b\f\n\r\t") == NULL &&
184-
!containsControlCharacter(value))
185-
return JSONCPP_STRING("\"") + value + "\"";
186-
// We have to walk value and escape any special characters.
187-
// Appending to JSONCPP_STRING is not efficient, but this should be rare.
188-
// (Note: forward slashes are *not* rare, but I am not escaping them.)
189-
JSONCPP_STRING::size_type maxsize =
190-
strlen(value) * 2 + 3; // allescaped+quotes+NULL
191-
JSONCPP_STRING result;
192-
result.reserve(maxsize); // to avoid lots of mallocs
193-
result += "\"";
194-
for (const char* c = value; *c != 0; ++c) {
195-
switch (*c) {
196-
case '\"':
197-
result += "\\\"";
198-
break;
199-
case '\\':
200-
result += "\\\\";
201-
break;
202-
case '\b':
203-
result += "\\b";
204-
break;
205-
case '\f':
206-
result += "\\f";
207-
break;
208-
case '\n':
209-
result += "\\n";
210-
break;
211-
case '\r':
212-
result += "\\r";
213-
break;
214-
case '\t':
215-
result += "\\t";
216-
break;
217-
// case '/':
218-
// Even though \/ is considered a legal escape in JSON, a bare
219-
// slash is also legal, so I see no reason to escape it.
220-
// (I hope I am not misunderstanding something.
221-
// blep notes: actually escaping \/ may be useful in javascript to avoid </
222-
// sequence.
223-
// Should add a flag to allow this compatibility mode and prevent this
224-
// sequence from occurring.
225-
default:
226-
if (isControlCharacter(*c)) {
227-
JSONCPP_OSTRINGSTREAM oss;
228-
oss << "\\u" << std::hex << std::uppercase << std::setfill('0')
229-
<< std::setw(4) << static_cast<int>(*c);
230-
result += oss.str();
231-
} else {
232-
result += *c;
233-
}
234-
break;
235-
}
161+
// Reports whether any of the n bytes starting at s cannot be copied verbatim
// between the quotes of a JSON string: a backslash, a double quote, a control
// byte below 0x20 (which includes an embedded NUL), or a non-ASCII byte
// (0x80 and above, i.e. part of a multi-byte UTF-8 sequence).
//
// s may be NULL only when n is 0. Returns false for an empty range.
static bool isAnyCharRequiredQuoting(char const* s, size_t n) {
  assert(s || !n);

  char const* const end = s + n;
  for (char const* cur = s; cur < end; ++cur) {
    // Look at the byte as unsigned so the answer does not depend on whether
    // plain char is signed on the target platform.
    unsigned char const c = static_cast<unsigned char>(*cur);
    if (c == '\\' || c == '"' || c < 0x20 || c >= 0x80)
      return true;
  }
  return false;
}
240172

241-
// https://github.com/upcaste/upcaste/blob/master/src/upcore/src/cstring/strnpbrk.cpp
242-
static char const* strnpbrk(char const* s, char const* accept, size_t n) {
243-
assert((s || !n) && accept);
173+
// Decodes one UTF-8 encoded code point starting at *s, where e is the
// one-past-the-end pointer of the buffer. Requires s < e, because the first
// byte is read unconditionally.
//
// On a decoded multi-byte sequence s is advanced to the LAST byte of that
// sequence (by 1, 2 or 3); for a single-byte character s is not moved.
//
// Returns U+FFFD (REPLACEMENT CHARACTER) when:
//   - the sequence would run past e (s is left unchanged in that case),
//   - the value is an overlong encoding,
//   - the value is a UTF-16 surrogate (U+D800..U+DFFF),
//   - the value is above U+10FFFF,
//   - the lead byte is 0xF8 or above.
// Continuation bytes are not validated; only their low six bits are used.
static unsigned int utf8ToCodepoint(const char*& s, const char* e) {
  const unsigned int REPLACEMENT_CHARACTER = 0xFFFD;

  unsigned int firstByte = static_cast<unsigned char>(*s);

  // Plain ASCII: the byte is the code point.
  if (firstByte < 0x80)
    return firstByte;

  // Two-byte sequence: 5 payload bits in the lead byte.
  if (firstByte < 0xE0) {
    if (e - s < 2)
      return REPLACEMENT_CHARACTER;

    unsigned int calculated = ((firstByte & 0x1F) << 6)
      | (static_cast<unsigned char>(s[1]) & 0x3Fu);
    s += 1;
    // overlong encodings are invalid
    return calculated < 0x80 ? REPLACEMENT_CHARACTER : calculated;
  }

  // Three-byte sequence: 4 payload bits in the lead byte.
  if (firstByte < 0xF0) {
    if (e - s < 3)
      return REPLACEMENT_CHARACTER;

    unsigned int calculated = ((firstByte & 0x0F) << 12)
      | ((static_cast<unsigned char>(s[1]) & 0x3Fu) << 6)
      | (static_cast<unsigned char>(s[2]) & 0x3Fu);
    s += 2;
    // Surrogates are not valid code points by themselves and
    // must not be UTF-8 encoded.
    if (calculated >= 0xD800 && calculated <= 0xDFFF)
      return REPLACEMENT_CHARACTER;
    // overlong encodings are invalid
    return calculated < 0x800 ? REPLACEMENT_CHARACTER : calculated;
  }

  // Four-byte sequence: 3 payload bits in the lead byte, followed by
  // 3 * 6 = 18 bits from the continuation bytes.
  if (firstByte < 0xF8) {
    if (e - s < 4)
      return REPLACEMENT_CHARACTER;

    unsigned int calculated = ((firstByte & 0x07) << 18)
      | ((static_cast<unsigned char>(s[1]) & 0x3Fu) << 12)
      | ((static_cast<unsigned char>(s[2]) & 0x3Fu) << 6)
      | (static_cast<unsigned char>(s[3]) & 0x3Fu);
    s += 3;
    // Values above U+10FFFF are outside the Unicode range and cannot be
    // expressed as a UTF-16 surrogate pair.
    if (calculated > 0x10FFFF)
      return REPLACEMENT_CHARACTER;
    // overlong encodings are invalid
    return calculated < 0x10000 ? REPLACEMENT_CHARACTER : calculated;
  }

  return REPLACEMENT_CHARACTER;
}
223+
224+
static const char hex2[] =
225+
"000102030405060708090a0b0c0d0e0f"
226+
"101112131415161718191a1b1c1d1e1f"
227+
"202122232425262728292a2b2c2d2e2f"
228+
"303132333435363738393a3b3c3d3e3f"
229+
"404142434445464748494a4b4c4d4e4f"
230+
"505152535455565758595a5b5c5d5e5f"
231+
"606162636465666768696a6b6c6d6e6f"
232+
"707172737475767778797a7b7c7d7e7f"
233+
"808182838485868788898a8b8c8d8e8f"
234+
"909192939495969798999a9b9c9d9e9f"
235+
"a0a1a2a3a4a5a6a7a8a9aaabacadaeaf"
236+
"b0b1b2b3b4b5b6b7b8b9babbbcbdbebf"
237+
"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf"
238+
"d0d1d2d3d4d5d6d7d8d9dadbdcdddedf"
239+
"e0e1e2e3e4e5e6e7e8e9eaebecedeeef"
240+
"f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff";
241+
242+
static JSONCPP_STRING toHex16Bit(unsigned int x) {
243+
const unsigned int hi = (x >> 8) & 0xff;
244+
const unsigned int lo = x & 0xff;
245+
JSONCPP_STRING result(4, ' ');
246+
result[0] = hex2[2 * hi];
247+
result[1] = hex2[2 * hi + 1];
248+
result[2] = hex2[2 * lo];
249+
result[3] = hex2[2 * lo + 1];
250+
return result;
251+
}
252+
256253
static JSONCPP_STRING valueToQuotedStringN(const char* value, unsigned length) {
257254
if (value == NULL)
258255
return "";
259-
// Not sure how to handle unicode...
260-
if (strnpbrk(value, "\"\\\b\f\n\r\t", length) == NULL &&
261-
!containsControlCharacter0(value, length))
256+
257+
if (!isAnyCharRequiredQuoting(value, length))
262258
return JSONCPP_STRING("\"") + value + "\"";
263259
// We have to walk value and escape any special characters.
264260
// Appending to JSONCPP_STRING is not efficient, but this should be rare.
@@ -300,14 +296,24 @@ static JSONCPP_STRING valueToQuotedStringN(const char* value, unsigned length) {
300296
// sequence.
301297
// Should add a flag to allow this compatibility mode and prevent this
302298
// sequence from occurring.
303-
default:
304-
if ((isControlCharacter(*c)) || (*c == 0)) {
305-
JSONCPP_OSTRINGSTREAM oss;
306-
oss << "\\u" << std::hex << std::uppercase << std::setfill('0')
307-
<< std::setw(4) << static_cast<int>(*c);
308-
result += oss.str();
309-
} else {
310-
result += *c;
299+
default: {
300+
unsigned int cp = utf8ToCodepoint(c, end);
301+
// don't escape non-control characters
302+
// (short escape sequences are applied above)
303+
if (cp < 0x80 && cp >= 0x20)
304+
result += static_cast<char>(cp);
305+
else if (cp < 0x10000) { // codepoint is in Basic Multilingual Plane
306+
result += "\\u";
307+
result += toHex16Bit(cp);
308+
}
309+
else { // codepoint is not in Basic Multilingual Plane
310+
// convert to surrogate pair first
311+
cp -= 0x10000;
312+
result += "\\u";
313+
result += toHex16Bit((cp >> 10) + 0xD800);
314+
result += "\\u";
315+
result += toHex16Bit((cp & 0x3FF) + 0xDC00);
316+
}
311317
}
312318
break;
313319
}
@@ -316,6 +322,10 @@ static JSONCPP_STRING valueToQuotedStringN(const char* value, unsigned length) {
316322
return result;
317323
}
318324

325+
// Quotes and escapes a NUL-terminated C string for JSON output by delegating
// to valueToQuotedStringN with the string's strlen() length.
// A NULL value yields an empty string; the check must happen here because
// calling strlen() on a NULL pointer is undefined behaviour.
JSONCPP_STRING valueToQuotedString(const char* value) {
  if (value == NULL)
    return "";
  return valueToQuotedStringN(value, static_cast<unsigned int>(strlen(value)));
}
328+
319329
// Class Writer
320330
// //////////////////////////////////////////////////////////////////
321331
Writer::~Writer() {}

0 commit comments

Comments
 (0)