Skip to content

Commit ebfa683

Browse files
committed
Add grapheme_levenshtein function.
Measure the Levenshtein distance in grapheme cluster units.
1 parent c919ab4 commit ebfa683

File tree

6 files changed

+359
-1
lines changed

6 files changed

+359
-1
lines changed

NEWS

+1
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ PHP NEWS
8989
. Added Locale::isRightToLeft to check if a locale is written right to left.
9090
(David Carlier)
9191
. Added a check for the presence of null bytes in locale inputs for the Locale class. (David Carlier)
92+
. Added grapheme_levenshtein() function. (Yuya Hamada)
9293

9394
- MySQLi:
9495
. Fixed bugs GH-17900 and GH-8084 (calling mysqli::__construct twice).

UPGRADING

+2
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,8 @@ PHP 8.5 UPGRADE NOTES
317317
- Intl:
318318
. Added locale_is_right_to_left/Locale::isRightToLeft, returns true if
319319
the locale is written right to left (after its enrichment with likely subtags).
320+
. Added grapheme_levenshtein() function.
321+
RFC: https://wiki.php.net/rfc/grapheme_levenshtein
320322

321323
- Pdo\Sqlite:
322324
. Added support for Pdo\Sqlite::setAuthorizer(), which is the equivalent of

ext/intl/grapheme/grapheme_string.c

+215
Original file line numberDiff line numberDiff line change
@@ -918,4 +918,219 @@ PHP_FUNCTION(grapheme_str_split)
918918
ubrk_close(bi);
919919
}
920920

921+
PHP_FUNCTION(grapheme_levenshtein)
922+
{
923+
zend_string *string1, *string2;
924+
zend_long cost_ins = 1;
925+
zend_long cost_rep = 1;
926+
zend_long cost_del = 1;
927+
928+
ZEND_PARSE_PARAMETERS_START(2, 5)
929+
Z_PARAM_STR(string1)
930+
Z_PARAM_STR(string2)
931+
Z_PARAM_OPTIONAL
932+
Z_PARAM_LONG(cost_ins)
933+
Z_PARAM_LONG(cost_rep)
934+
Z_PARAM_LONG(cost_del)
935+
ZEND_PARSE_PARAMETERS_END();
936+
937+
if (cost_ins <= 0 || cost_ins > UINT_MAX / 4) {
938+
zend_argument_value_error(3, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
939+
RETURN_THROWS();
940+
}
941+
942+
if (cost_rep <= 0 || cost_rep > UINT_MAX / 4) {
943+
zend_argument_value_error(4, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
944+
RETURN_THROWS();
945+
}
946+
947+
if (cost_del <= 0 || cost_del > UINT_MAX / 4) {
948+
zend_argument_value_error(5, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
949+
RETURN_THROWS();
950+
}
951+
952+
zend_long c0, c1, c2;
953+
zend_long retval;
954+
size_t i2;
955+
char *pstr1, *pstr2;
956+
957+
UChar *ustring1 = NULL;
958+
UChar *ustring2 = NULL;
959+
960+
int32_t ustring1_len = 0;
961+
int32_t ustring2_len = 0;
962+
963+
UErrorCode ustatus = U_ZERO_ERROR;
964+
965+
/* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
966+
* that the distance is symmetric. If string1 is shorter than string2 we can save memory (and CPU time)
967+
* by having shorter rows (p1 & p2). */
968+
if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) {
969+
zend_string *tmp = string1;
970+
string1 = string2;
971+
string2 = tmp;
972+
}
973+
974+
pstr1 = ZSTR_VAL(string1);
975+
pstr2 = ZSTR_VAL(string2);
976+
977+
intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus);
978+
979+
if (U_FAILURE(ustatus)) {
980+
intl_error_set_code(NULL, ustatus);
981+
982+
intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0);
983+
efree(ustring1);
984+
RETURN_FALSE;
985+
}
986+
987+
intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus);
988+
989+
if (U_FAILURE(ustatus)) {
990+
intl_error_set_code(NULL, ustatus);
991+
992+
intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0);
993+
efree(ustring2);
994+
efree(ustring1);
995+
RETURN_FALSE;
996+
}
997+
998+
UBreakIterator *bi1, *bi2;
999+
1000+
int32_t strlen_1, strlen_2;
1001+
strlen_1 = grapheme_split_string(ustring1, ustring1_len, NULL, 0);
1002+
strlen_2 = grapheme_split_string(ustring2, ustring2_len, NULL, 0);
1003+
1004+
if (strlen_1 == 0) {
1005+
efree(ustring1);
1006+
efree(ustring2);
1007+
RETURN_LONG(strlen_2 * cost_ins);
1008+
}
1009+
if (strlen_2 == 0) {
1010+
efree(ustring1);
1011+
efree(ustring2);
1012+
RETURN_LONG(strlen_1 * cost_del);
1013+
}
1014+
1015+
unsigned char u_break_iterator_buffer1[U_BRK_SAFECLONE_BUFFERSIZE];
1016+
unsigned char u_break_iterator_buffer2[U_BRK_SAFECLONE_BUFFERSIZE];
1017+
bi1 = grapheme_get_break_iterator(u_break_iterator_buffer1, &ustatus);
1018+
if (U_FAILURE(ustatus)) {
1019+
intl_error_set_code(NULL, ustatus);
1020+
intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #1 ($string1)", 0);
1021+
efree(ustring2);
1022+
efree(ustring1);
1023+
ubrk_close(bi1);
1024+
RETURN_FALSE;
1025+
}
1026+
1027+
bi2 = grapheme_get_break_iterator(u_break_iterator_buffer2, &ustatus);
1028+
if (U_FAILURE(ustatus)) {
1029+
intl_error_set_code(NULL, ustatus);
1030+
intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #2 ($string2)", 0);
1031+
efree(ustring2);
1032+
efree(ustring1);
1033+
ubrk_close(bi2);
1034+
ubrk_close(bi1);
1035+
RETURN_FALSE;
1036+
}
1037+
ubrk_setText(bi1, ustring1, ustring1_len, &ustatus);
1038+
1039+
if (U_FAILURE(ustatus)) {
1040+
intl_error_set_code(NULL, ustatus);
1041+
1042+
intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #1 ($string1)", 0);
1043+
efree(ustring2);
1044+
efree(ustring1);
1045+
ubrk_close(bi2);
1046+
ubrk_close(bi1);
1047+
RETURN_FALSE;
1048+
}
1049+
1050+
ubrk_setText(bi2, ustring2, ustring2_len, &ustatus);
1051+
if (U_FAILURE(ustatus)) {
1052+
intl_error_set_code(NULL, ustatus);
1053+
1054+
intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #2 ($string2)", 0);
1055+
efree(ustring2);
1056+
efree(ustring1);
1057+
ubrk_close(bi2);
1058+
ubrk_close(bi1);
1059+
RETURN_FALSE;
1060+
}
1061+
UCollator *collator = ucol_open("", &ustatus);
1062+
if (U_FAILURE(ustatus)) {
1063+
intl_error_set_code(NULL, ustatus);
1064+
1065+
intl_error_set_custom_msg(NULL, "Error on ucol_open", 0);
1066+
efree(ustring2);
1067+
efree(ustring1);
1068+
ubrk_close(bi2);
1069+
ubrk_close(bi1);
1070+
ucol_close(collator);
1071+
RETURN_FALSE;
1072+
}
1073+
1074+
zend_long *p1, *p2, *tmp;
1075+
p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
1076+
p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
1077+
1078+
for (i2 = 0; i2 <= strlen_2; i2++) {
1079+
p1[i2] = i2 * cost_ins;
1080+
}
1081+
1082+
int32_t current1 = 0;
1083+
int32_t current2 = 0;
1084+
int32_t pos1 = 0;
1085+
int32_t pos2 = 0;
1086+
1087+
while (true) {
1088+
current1 = ubrk_current(bi1);
1089+
pos1 = ubrk_next(bi1);
1090+
if (pos1 == UBRK_DONE) {
1091+
break;
1092+
}
1093+
p2[0] = p1[0] + cost_del;
1094+
for (i2 = 0, pos2 = 0; pos2 != UBRK_DONE; i2++) {
1095+
current2 = ubrk_current(bi2);
1096+
pos2 = ubrk_next(bi2);
1097+
if (pos2 == UBRK_DONE) {
1098+
break;
1099+
}
1100+
if (ucol_strcoll(collator, ustring1 + current1, pos1 - current1, ustring2 + current2, pos2 - current2) == UCOL_EQUAL) {
1101+
c0 = p1[i2];
1102+
} else {
1103+
c0 = p1[i2] + cost_rep;
1104+
}
1105+
c1 = p1[i2 + 1] + cost_del;
1106+
if (c1 < c0) {
1107+
c0 = c1;
1108+
}
1109+
c2 = p2[i2] + cost_ins;
1110+
if (c2 < c0) {
1111+
c0 = c2;
1112+
}
1113+
p2[i2 + 1] = c0;
1114+
}
1115+
ubrk_first(bi2);
1116+
tmp = p1;
1117+
p1 = p2;
1118+
p2 = tmp;
1119+
}
1120+
1121+
ucol_close(collator);
1122+
1123+
ubrk_close(bi1);
1124+
ubrk_close(bi2);
1125+
1126+
efree(ustring1);
1127+
efree(ustring2);
1128+
1129+
retval = p1[strlen_2];
1130+
1131+
efree(p1);
1132+
efree(p2);
1133+
RETURN_LONG(retval);
1134+
}
1135+
9211136
/* }}} */

ext/intl/php_intl.stub.php

+2
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,8 @@ function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle =
447447

448448
function grapheme_str_split(string $string, int $length = 1): array|false {}
449449

450+
function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1): int|false {}
451+
450452
/** @param int $next */
451453
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}
452454

ext/intl/php_intl_arginfo.h

+11-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)