@@ -85,4 +85,78 @@ public static function findSmallestDeserializableStringAndPath(array $tokens, mi
8585 'tokens ' => $ remainingTokens ,
8686 ];
8787 }
88+
/**
 * Resolves the backslash escape sequences found inside a quoted string literal.
 *
 * Double-quoted content is first handed to json_decode(); if that fails, or
 * for any other quote character, the string is scanned byte by byte.
 * Recognized escapes are \" \' \\ \/ \b \f \n \r \t and \uXXXX. Any other
 * escaped character is copied through together with its backslash.
 *
 * @param string $str       the literal's content, without the surrounding quotes
 * @param string $quoteChar the quote character that delimited the literal
 *
 * @return string the content with the recognized escape sequences replaced
 */
public static function unescapeString(string $str, string $quoteChar): string
{
    if ('"' === $quoteChar) {
        // try JSON decoding first for unicode sequences
        $decoded = json_decode('"'.$str.'"', true);

        // only a decoded string is acceptable; anything else falls through to the manual scan
        if (\is_string($decoded)) {
            return $decoded;
        }
    }

    $result = '';
    $length = \strlen($str);

    for ($i = 0; $i < $length; ++$i) {
        // plain byte, or a trailing backslash with nothing left to escape
        if ('\\' !== $str[$i] || $i + 1 >= $length) {
            $result .= $str[$i];

            continue;
        }

        $result .= match ($str[$i + 1]) {
            '"' => '"',
            "'" => "'",
            '\\' => '\\',
            '/' => '/',
            // PHP has no "\b" escape: "\b" is a backslash followed by "b", so spell out U+0008
            'b' => "\x08",
            'f' => "\f",
            'n' => "\n",
            'r' => "\r",
            't' => "\t",
            // the helper moves $i (by reference) over the hex digits it consumes
            'u' => self::unescapeUnicodeSequence($str, $length, $i),
            default => $str[$i].$str[$i + 1], // keep the backslash
        };

        // step over the character that followed the backslash
        ++$i;
    }

    return $result;
}
128+
/**
 * Decodes the "\uXXXX" escape whose backslash sits at offset $i of $str.
 *
 * A high surrogate immediately followed by a "\uXXXX" low surrogate is
 * combined into a single code point. $i is advanced over the hex digits
 * that were consumed (and over the whole second escape for a surrogate
 * pair); stepping over the "u" itself is left to the caller.
 *
 * @param string $str    the string being unescaped
 * @param int    $length the byte length of $str
 * @param int    $i      offset of the backslash, updated by reference
 *
 * @return string the decoded character, the literal "\u" when the escape is
 *                truncated or not hexadecimal, or an empty string when the
 *                code point cannot be encoded
 */
private static function unescapeUnicodeSequence(string $str, int $length, int &$i): string
{
    if ($i + 5 >= $length) {
        // Not enough characters for a Unicode escape, treat as literal.
        // unescapeString() steps over the "u" after this call, so the "u"
        // must be part of the returned text or it would be lost.
        return '\\u';
    }

    $hex = substr($str, $i + 2, 4);
    if (!ctype_xdigit($hex)) {
        // invalid hex, treat as literal (same reasoning as above)
        return '\\u';
    }

    $codepoint = hexdec($hex);

    // high surrogate, enough room for a second escape, and that escape starts with \u
    if (0xD800 <= $codepoint && $codepoint <= 0xDBFF && $i + 11 < $length && '\\' === $str[$i + 6] && 'u' === $str[$i + 7]) {
        $lowHex = substr($str, $i + 8, 4);
        if (ctype_xdigit($lowHex)) {
            $lowSurrogate = hexdec($lowHex);
            if (0xDC00 <= $lowSurrogate && $lowSurrogate <= 0xDFFF) {
                // standard UTF-16 surrogate pair to code point conversion
                $codepoint = 0x10000 + (($codepoint & 0x3FF) << 10) + ($lowSurrogate & 0x3FF);
                $i += 10; // skip surrogate pair

                return mb_chr($codepoint, 'UTF-8');
            }
        }
    }

    // single Unicode character or invalid surrogate, skip the sequence
    $i += 4;

    $char = mb_chr($codepoint, 'UTF-8');

    // mb_chr() signals failure with false (presumably the case for an unpaired
    // surrogate - confirm against the PHP version in use). Return an empty
    // string explicitly rather than letting false reach the string return type.
    return false === $char ? '' : $char;
}
88162}
0 commit comments