diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index 77bf4319928a8..c5ac6da7b347a 100644 --- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -918,4 +918,185 @@ PHP_FUNCTION(grapheme_str_split) ubrk_close(bi); } +PHP_FUNCTION(grapheme_levenshtein) +{ + zend_string *string1, *string2; + zend_long cost_ins = 1; + zend_long cost_rep = 1; + zend_long cost_del = 1; + + ZEND_PARSE_PARAMETERS_START(2, 5) + Z_PARAM_STR(string1) + Z_PARAM_STR(string2) + Z_PARAM_OPTIONAL + Z_PARAM_LONG(cost_ins) + Z_PARAM_LONG(cost_rep) + Z_PARAM_LONG(cost_del) + ZEND_PARSE_PARAMETERS_END(); + + if (cost_ins <= 0 || cost_ins > UINT_MAX / 4) { + zend_argument_value_error(3, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4); + RETURN_THROWS(); + } + + if (cost_rep <= 0 || cost_rep > UINT_MAX / 4) { + zend_argument_value_error(4, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4); + RETURN_THROWS(); + } + + if (cost_del <= 0 || cost_del > UINT_MAX / 4) { + zend_argument_value_error(5, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4); + RETURN_THROWS(); + } + + zend_long *p1, *p2, *tmp; + zend_long c0, c1, c2; + zend_long retval; + size_t i2; + char *pstr1, *pstr2; + + UChar *ustring1 = NULL; + UChar *ustring2 = NULL; + + int32_t ustring1_len = 0; + int32_t ustring2_len = 0; + + UErrorCode ustatus1 = U_ZERO_ERROR; + UErrorCode ustatus2 = U_ZERO_ERROR; + + /* When all costs are equal, levenshtein fulfills the requirements of a metric, which means + * that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time) + * by having shorter rows (p1 & p2). */ + if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) { + zend_string *tmp = string1; + string1 = string2; + string2 = tmp; + } + + pstr1 = ZSTR_VAL(string1); + pstr2 = ZSTR_VAL(string2); + + intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus1); + + if (U_FAILURE(ustatus1)) { + /* Set global error code. */ + intl_error_set_code( NULL, ustatus1 ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 ); + if (ustring1) { + efree(ustring1); + } + RETURN_FALSE; + } + + intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus2); + + if (U_FAILURE(ustatus2)) { + /* Set global error code. */ + intl_error_set_code(NULL, ustatus2); + + /* Set error messages. */ + intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0); + if (ustring2) { + efree(ustring2); + } + if (ustring1) { + efree(ustring1); + } + RETURN_FALSE; + } + + UText *ut1 = NULL; + UText *ut2 = NULL; + UBreakIterator *bi1, *bi2; + + int32_t strlen_1, strlen_2; + strlen_1 = grapheme_split_string(ustring1, ustring1_len, NULL, 0); + strlen_2 = grapheme_split_string(ustring2, ustring2_len, NULL, 0); + + if (strlen_1 == 0) { + efree(ustring1); + efree(ustring2); + RETURN_LONG(strlen_2 * cost_ins); + } + if (strlen_2 == 0) { + efree(ustring1); + efree(ustring2); + RETURN_LONG(strlen_1 * cost_del); + } + + unsigned char u_break_iterator_buffer1[U_BRK_SAFECLONE_BUFFERSIZE]; + unsigned char u_break_iterator_buffer2[U_BRK_SAFECLONE_BUFFERSIZE]; + bi1 = grapheme_get_break_iterator((void*)u_break_iterator_buffer1, &ustatus1); + bi2 = grapheme_get_break_iterator((void*)u_break_iterator_buffer2, &ustatus2); + + ut1 = utext_openUTF8(ut1, pstr1, ZSTR_LEN(string1), &ustatus1); + ubrk_setUText(bi1, ut1, &ustatus1); + ut2 = utext_openUTF8(ut2, pstr2, ZSTR_LEN(string2), &ustatus2); + ubrk_setUText(bi2, ut2, &ustatus2); + + p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0); + p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0); + + for (i2 = 0; i2 <= strlen_2; i2++) { + p1[i2] = i2 * cost_ins; + } + + int32_t current1 = 0; + int32_t current2 = 0; + int32_t pos1 = 0; + int32_t pos2 = 0; + int32_t usrch_pos = 0; + for (; pos1 != UBRK_DONE;) { + current1 = ubrk_current(bi1); + pos1 = ubrk_next(bi1); + if (pos1 == UBRK_DONE) { + break; + } + p2[0] = p1[0] + cost_del; + for (i2 = 0, pos2 = 0; pos2 != UBRK_DONE; i2++) { + current2 = ubrk_current(bi2); + pos2 = ubrk_next(bi2); + if (pos2 == UBRK_DONE) { + break; + } + usrch_pos = grapheme_strpos_utf16(pstr1 + current1, pos1 - current1, pstr2 + current2, pos2 - current2, 0, NULL, 0, 0); + if (usrch_pos == 0) { + c0 = p1[i2]; + } else { + c0 = p1[i2] + cost_rep; + } + c1 = p1[i2 + 1] + cost_del; + if (c1 < c0) { + c0 = c1; + } + c2 = p2[i2] + cost_ins; + if (c2 < c0) { + c0 = c2; + } + p2[i2 + 1] = c0; + } + ubrk_first(bi2); + tmp = p1; + p1 = p2; + p2 = tmp; + } + + utext_close(ut1); + utext_close(ut2); + + ubrk_close(bi1); + ubrk_close(bi2); + + efree(ustring1); + efree(ustring2); + + retval = p1[strlen_2]; + + efree(p1); + efree(p2); + RETURN_LONG(retval); +} + /* }}} */ diff --git a/ext/intl/php_intl.stub.php b/ext/intl/php_intl.stub.php index f3a80dd511943..572c4a4b333ae 100644 --- a/ext/intl/php_intl.stub.php +++ b/ext/intl/php_intl.stub.php @@ -447,6 +447,8 @@ function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle = function grapheme_str_split(string $string, int $length = 1): array|false {} +function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1): int|false {} + /** @param int $next */ function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {} diff --git a/ext/intl/php_intl_arginfo.h b/ext/intl/php_intl_arginfo.h index 11c585d8df63b..23a4a1d6fbfc6 100644 --- a/ext/intl/php_intl_arginfo.h +++ b/ext/intl/php_intl_arginfo.h @@ -1,5 +1,5 @@ /* This is a generated file, edit the .stub.php file instead. - * Stub hash: 168eabfdcbf29189f2327448f104ea98752d1c5a */ + * Stub hash: 5039dc739e445832b7f3e91afb6d62dc272d2fa3 */ ZEND_BEGIN_ARG_WITH_RETURN_OBJ_INFO_EX(arginfo_intlcal_create_instance, 0, 0, IntlCalendar, 1) ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, timezone, "null") @@ -489,6 +489,14 @@ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_str_split, 0, 1, MAY_BE ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, length, IS_LONG, 0, "1") ZEND_END_ARG_INFO() +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_levenshtein, 0, 2, MAY_BE_LONG|MAY_BE_FALSE) + ZEND_ARG_TYPE_INFO(0, string1, IS_STRING, 0) + ZEND_ARG_TYPE_INFO(0, string2, IS_STRING, 0) + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, insertion_cost, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, replacement_cost, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, deletion_cost, IS_LONG, 0, "1") +ZEND_END_ARG_INFO() + ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_extract, 0, 2, MAY_BE_STRING|MAY_BE_FALSE) ZEND_ARG_TYPE_INFO(0, haystack, IS_STRING, 0) ZEND_ARG_TYPE_INFO(0, size, IS_LONG, 0) @@ -899,6 +907,7 @@ ZEND_FUNCTION(grapheme_substr); ZEND_FUNCTION(grapheme_strstr); ZEND_FUNCTION(grapheme_stristr); ZEND_FUNCTION(grapheme_str_split); +ZEND_FUNCTION(grapheme_levenshtein); ZEND_FUNCTION(grapheme_extract); ZEND_FUNCTION(idn_to_ascii); ZEND_FUNCTION(idn_to_utf8); @@ -1086,6 +1095,7 @@ static const zend_function_entry ext_functions[] = { ZEND_FE(grapheme_strstr, arginfo_grapheme_strstr) ZEND_FE(grapheme_stristr, arginfo_grapheme_stristr) ZEND_FE(grapheme_str_split, arginfo_grapheme_str_split) + ZEND_FE(grapheme_levenshtein, arginfo_grapheme_levenshtein) ZEND_FE(grapheme_extract, arginfo_grapheme_extract) ZEND_FE(idn_to_ascii, arginfo_idn_to_ascii) ZEND_FE(idn_to_utf8, arginfo_idn_to_utf8) diff --git a/ext/intl/tests/grapheme_levenshtein.phpt b/ext/intl/tests/grapheme_levenshtein.phpt new file mode 100644 index 0000000000000..eec36ea2f9801 --- /dev/null +++ b/ext/intl/tests/grapheme_levenshtein.phpt @@ -0,0 +1,104 @@ +--TEST-- +grapheme_levenshtein() function test +--EXTENSIONS-- +intl +--FILE-- +<?php +echo '--- Equal ---' . \PHP_EOL; +var_dump(grapheme_levenshtein('12345', '12345')); + +echo '--- First string empty ---' . \PHP_EOL; +var_dump(grapheme_levenshtein('', 'xyz')); +echo '--- Second string empty ---' . \PHP_EOL; +var_dump(grapheme_levenshtein('xyz', '')); +echo '--- Both empty ---' . \PHP_EOL; +var_dump(grapheme_levenshtein('', '')); +var_dump(grapheme_levenshtein('', '', 10, 10, 10)); + +echo '--- 1 character ---' . \PHP_EOL; +var_dump(grapheme_levenshtein('1', '2')); +echo '--- 2 character swapped ---' . \PHP_EOL; +var_dump(grapheme_levenshtein('12', '21')); + +echo '--- Inexpensive deletion ---' . \PHP_EOL; +var_dump(grapheme_levenshtein('2121', '11', 2)); +echo '--- Expensive deletion ---' . \PHP_EOL; +var_dump(grapheme_levenshtein('2121', '11', 2, 1, 5)); + +// +echo '--- Inexpensive insertion ---' . \PHP_EOL; +var_dump(grapheme_levenshtein('11', '2121')); +echo '--- Expensive insertion ---' . \PHP_EOL; +var_dump(grapheme_levenshtein('11', '2121', 5)); + +echo '--- Expensive replacement ---' . \PHP_EOL; +var_dump(grapheme_levenshtein('111', '121', 2, 3, 2)); +echo '--- Very expensive replacement ---' . \PHP_EOL; +var_dump(grapheme_levenshtein('111', '121', 2, 9, 2)); + +echo '--- 128 codepoints ---' . \PHP_EOL; +var_dump(grapheme_levenshtein(str_repeat("a", 128), str_repeat("a", 125) . "abc")); +echo '--- 128 codepoints over ---' . \PHP_EOL; +var_dump(grapheme_levenshtein(str_repeat("a", 128) . "abc", str_repeat("a", 128) . "aaa")); +var_dump(grapheme_levenshtein(str_repeat("a", 256) . "abc", "aaa")); +echo '--- 128 codepoints over only $string1 ---' . \PHP_EOL; +var_dump(grapheme_levenshtein(str_repeat("a", 128) . "abc", "aaa")); +echo '--- 128 codepoints over only $string2 ---' . \PHP_EOL; +var_dump(grapheme_levenshtein("abc", str_repeat("a", 128) . "aaa")); +echo '--- 128 codepoints over Hiragana ---' . \PHP_EOL; +var_dump(grapheme_levenshtein(str_repeat("あ", 128) . "あああ", str_repeat("あ", 128) . "あいう")); + +echo '--- Variable selector ---' . \PHP_EOL; +$ka = "カ́"; +var_dump(grapheme_levenshtein("カ", $ka)); +// variable $nabe and $nabe_E0100 is seems nothing different. +// However, $nabe_E0100 is variable selector in U+908A U+E0100. +// So grapheme_levenshtein result is maybe 0. +$nabe = '邊'; +$nabe_E0100 = "邊󠄀"; +var_dump(grapheme_levenshtein($nabe, $nabe_E0100)); + +// combining character +var_dump(grapheme_levenshtein("\u{0065}\u{0301}", "\u{00e9}")); +?> +--EXPECT-- +--- Equal --- +int(0) +--- First string empty --- +int(3) +--- Second string empty --- +int(3) +--- Both empty --- +int(0) +int(0) +--- 1 character --- +int(1) +--- 2 character swapped --- +int(2) +--- Inexpensive deletion --- +int(2) +--- Expensive deletion --- +int(10) +--- Inexpensive insertion --- +int(2) +--- Expensive insertion --- +int(10) +--- Expensive replacement --- +int(3) +--- Very expensive replacement --- +int(4) +--- 128 codepoints --- +int(2) +--- 128 codepoints over --- +int(2) +int(256) +--- 128 codepoints over only $string1 --- +int(128) +--- 128 codepoints over only $string2 --- +int(130) +--- 128 codepoints over Hiragana --- +int(2) +--- Variable selector --- +int(1) +int(0) +int(0)