Skip to content

Commit 80a2027

Browse files
[JsonPath] Better handling of Unicode chars in expressions
1 parent fed75eb commit 80a2027

File tree

7 files changed

+359
-6
lines changed

7 files changed

+359
-6
lines changed

JsonCrawler.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ private function evaluateBracket(string $expr, mixed $value): array
230230

231231
// quoted strings for object keys
232232
if (preg_match('/^([\'"])(.*)\1$/', $expr, $matches)) {
233-
$key = stripslashes($matches[2]);
233+
$key = JsonPathUtils::unescapeString($matches[2], $matches[1]);
234234

235235
return \array_key_exists($key, $value) ? [$value[$key]] : [];
236236
}
@@ -335,7 +335,7 @@ private function evaluateScalar(string $expr, array $context): mixed
335335

336336
// string literals
337337
if (preg_match('/^([\'"])(.*)\1$/', $expr, $matches)) {
338-
return $matches[2];
338+
return JsonPathUtils::unescapeString($matches[2], $matches[1]);
339339
}
340340

341341
// current node references

JsonCrawlerInterface.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ interface JsonCrawlerInterface
2525
* @return list<array|string|float|int|bool|null>
2626
*
2727
* @throws InvalidArgumentException When the JSON string provided to the crawler cannot be decoded
28-
* @throws JsonCrawlerException When a syntax error occurs in the provided JSON path
28+
* @throws JsonCrawlerException When a syntax error occurs in the provided JSON path
2929
*/
3030
public function find(string|JsonPath $query): array;
3131
}

JsonPath.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,12 @@ private function escapeKey(string $key): string
9292
"\r" => '\\r',
9393
"\t" => '\\t',
9494
"\b" => '\\b',
95-
"\f" => '\\f'
95+
"\f" => '\\f',
9696
]);
9797

98-
for ($i = 0; $i <= 31; $i++) {
98+
for ($i = 0; $i <= 31; ++$i) {
9999
if ($i < 8 || $i > 13) {
100-
$key = str_replace(chr($i), sprintf('\\u%04x', $i), $key);
100+
$key = str_replace(\chr($i), \sprintf('\\u%04x', $i), $key);
101101
}
102102
}
103103

JsonPathUtils.php

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,4 +85,78 @@ public static function findSmallestDeserializableStringAndPath(array $tokens, mi
8585
'tokens' => $remainingTokens,
8686
];
8787
}
88+
89+
/**
 * Resolves the backslash escape sequences of a quoted string literal.
 *
 * Double-quoted literals are first handed to the JSON parser, which resolves every
 * JSON escape (including \uXXXX sequences) in one go. Single-quoted literals, and
 * double-quoted ones that are not valid JSON strings (e.g. because they contain \'),
 * are unescaped by hand.
 *
 * @param string $str       The literal's content, without the surrounding quotes
 * @param string $quoteChar The quote character that delimited the literal (' or ")
 *
 * @return string The content with the recognized escapes resolved; unknown escapes and
 *                a trailing lone backslash are kept as they are
 */
public static function unescapeString(string $str, string $quoteChar): string
{
    if ('"' === $quoteChar) {
        // try JSON decoding first for unicode sequences
        $decoded = json_decode('"'.$str.'"', true);

        // the wrapped value can only decode to a string; anything else means the decoding failed
        if (\is_string($decoded)) {
            return $decoded;
        }
    }

    $result = '';
    $length = \strlen($str);

    for ($i = 0; $i < $length; ++$i) {
        // plain character, or a backslash with nothing after it: copy it through
        if ('\\' !== $str[$i] || $i + 1 >= $length) {
            $result .= $str[$i];

            continue;
        }

        $result .= match ($str[$i + 1]) {
            '"' => '"',
            "'" => "'",
            '\\' => '\\',
            '/' => '/',
            // PHP has no "\b" escape: "\b" is a backslash followed by "b", so the
            // backspace character has to be spelled with its hexadecimal value
            'b' => "\x08",
            'f' => "\f",
            'n' => "\n",
            'r' => "\r",
            't' => "\t",
            // the helper moves $i forward past the extra characters it consumes
            'u' => self::unescapeUnicodeSequence($str, $length, $i),
            default => $str[$i].$str[$i + 1], // keep the backslash
        };

        // skip the character that followed the backslash
        ++$i;
    }

    return $result;
}
128+
129+
/**
 * Decodes the \uXXXX escape (or the \uXXXX\uXXXX surrogate pair) starting at offset $i.
 *
 * On entry $i points at the backslash. The caller skips the "u" itself after this
 * method returns, so $i is only moved forward by the characters consumed beyond "\u":
 * 4 for a single escape, 10 for a surrogate pair, 0 when the escape is kept literally.
 *
 * @param string $str    The string being unescaped
 * @param int    $length The byte length of $str
 * @param int    $i      Offset of the backslash, advanced in place as described above
 *
 * @return string The UTF-8 encoded character, or the literal text "\u" when the sequence
 *                is truncated, is not hexadecimal, or is a surrogate without its pair
 */
private static function unescapeUnicodeSequence(string $str, int $length, int &$i): string
{
    // a full escape spans offsets $i to $i + 5: backslash, "u" and four hex digits
    if ($i + 5 >= $length) {
        // not enough characters for Unicode escape, treat as literal
        return '\\u';
    }

    $hex = substr($str, $i + 2, 4);
    if (!ctype_xdigit($hex)) {
        // invalid hex, treat as literal
        return '\\u';
    }

    $codepoint = hexdec($hex);
    $consumed = 4;

    // a high surrogate immediately followed by another complete \uXXXX escape may form a pair
    if (0xD800 <= $codepoint && $codepoint <= 0xDBFF && $i + 11 < $length && '\\' === $str[$i + 6] && 'u' === $str[$i + 7]) {
        $lowHex = substr($str, $i + 8, 4);
        if (ctype_xdigit($lowHex)) {
            $lowSurrogate = hexdec($lowHex);
            if (0xDC00 <= $lowSurrogate && $lowSurrogate <= 0xDFFF) {
                $codepoint = 0x10000 + (($codepoint & 0x3FF) << 10) + ($lowSurrogate & 0x3FF);
                $consumed = 10; // both escapes of the surrogate pair
            }
        }
    }

    // mb_chr() returns false for code points that cannot be encoded, i.e. lone surrogates
    $char = mb_chr($codepoint, 'UTF-8');
    if (false === $char) {
        // unpaired surrogate, treat as literal
        return '\\u';
    }

    $i += $consumed;

    return $char;
}
88162
}

0 commit comments

Comments
 (0)