From 49e91edf0109f9fe561b5c30960ffac863203a8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vilius=20=C5=A0umskas?= Date: Fri, 29 Oct 2021 01:57:25 +0300 Subject: [PATCH 1/4] Fix unicode support in highlighter Previously, highlighter could snip text at UTF-8 control producing completely empty relevant string. The patch changes highlighter operation from byte to string level. --- src/Support/Highlighter.php | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Support/Highlighter.php b/src/Support/Highlighter.php index 01bf151..417bcc3 100644 --- a/src/Support/Highlighter.php +++ b/src/Support/Highlighter.php @@ -104,11 +104,11 @@ public function _extractLocations($words, $fulltext) { $locations = array(); foreach ($words as $word) { - $wordlen = strlen($word); - $loc = stripos($fulltext, $word); + $wordlen = mb_strlen($word); + $loc = mb_stripos($fulltext, $word); while ($loc !== false) { $locations[] = $loc; - $loc = stripos($fulltext, $word, $loc + $wordlen); + $loc = mb_stripos($fulltext, $word, $loc + $wordlen); } } $locations = array_unique($locations); @@ -177,7 +177,7 @@ public function _determineSnipLocation($locations, $prevcount) public function extractRelevant($words, $fulltext, $rellength = 300, $prevcount = 50, $indicator = '...') { $words = preg_split($this->tokenizer->getPattern(), $words, -1, PREG_SPLIT_NO_EMPTY); - $textlength = strlen($fulltext); + $textlength = mb_strlen($fulltext); if ($textlength <= $rellength) { return $fulltext; } @@ -189,16 +189,16 @@ public function extractRelevant($words, $fulltext, $rellength = 300, $prevcount $startpos = $startpos - ($textlength - $startpos) / 2; } - $reltext = substr($fulltext, $startpos, $rellength); + $reltext = mb_substr($fulltext, $startpos, $rellength); // check to ensure we dont snip the last word if thats the match if ($startpos + $rellength < $textlength) { - $reltext = substr($reltext, 0, strrpos($reltext, " ")) . 
$indicator; // remove last word + $reltext = mb_substr($reltext, 0, mb_strrpos($reltext, " ")) . $indicator; // remove last word } // If we trimmed from the front add ... if ($startpos != 0) { - $reltext = $indicator . substr($reltext, strpos($reltext, " ") + 1); // remove first word + $reltext = $indicator . mb_substr($reltext, mb_strpos($reltext, " ") + 1); // remove first word } return $reltext; From 6cf98021e29b59689a4a6c71fce5b7a87d8718d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vilius=20=C5=A0umskas?= Date: Fri, 29 Oct 2021 02:30:54 +0300 Subject: [PATCH 2/4] Use word splitting pattern from Tokenizer. We cannot just split by space here because: a) the text could contain just one big string (when searching JSON data for example), in which case the relevant text is extracted at a completely different position, b) a custom Tokenizer could be used, which in theory could contain no spaces. The patch fixes both of these issues. --- src/Support/Highlighter.php | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/Support/Highlighter.php b/src/Support/Highlighter.php index 417bcc3..4f09f67 100644 --- a/src/Support/Highlighter.php +++ b/src/Support/Highlighter.php @@ -189,16 +189,30 @@ public function extractRelevant($words, $fulltext, $rellength = 300, $prevcount $startpos = $startpos - ($textlength - $startpos) / 2; } + // in case no match is found, reset position for proper math below + if ($startpos == -1) { + $startpos = 0; + } + $reltext = mb_substr($fulltext, $startpos, $rellength); + preg_match_all($this->tokenizer->getPattern(), $reltext, $offset, PREG_OFFSET_CAPTURE); + // since PREG_OFFSET_CAPTURE returns offset in bytes we have to use mb_strlen(substr()) hack here + $last = mb_strlen(substr($reltext, 0, end($offset[0])[1])); + $first = mb_strlen(substr($reltext, 0, $offset[0][0][1])); + + // if no match is found, just return first $rellength characters without the last word + if (empty($locations)) { + return mb_substr($reltext, 0, 
$last) . $indicator; + } // check to ensure we dont snip the last word if thats the match if ($startpos + $rellength < $textlength) { - $reltext = mb_substr($reltext, 0, mb_strrpos($reltext, " ")) . $indicator; // remove last word + $reltext = mb_substr($reltext, 0, $last) . $indicator; // remove last word } // If we trimmed from the front add ... if ($startpos != 0) { - $reltext = $indicator . mb_substr($reltext, mb_strpos($reltext, " ") + 1); // remove first word + $reltext = $indicator . mb_substr($reltext, $first + 1); // remove first word } return $reltext; From 1b5c586aeb0bf82f6ab56b41ebdb84aa05a45c9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vilius=20=C5=A0umskas?= Date: Fri, 29 Oct 2021 02:34:01 +0300 Subject: [PATCH 3/4] Fix spacing in docblocks --- src/Support/Highlighter.php | 86 ++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/src/Support/Highlighter.php b/src/Support/Highlighter.php index 4f09f67..aa42070 100644 --- a/src/Support/Highlighter.php +++ b/src/Support/Highlighter.php @@ -27,14 +27,14 @@ public function __construct(TokenizerInterface $tokenizer = null) } } - /** - * @param $text - * @param $needle - * @param string $tag - * @param array $options - * - * @return string - */ + /** + * @param $text + * @param $needle + * @param string $tag + * @param array $options + * + * @return string + */ public function highlight($text, $needle, $tag = 'em', $options = []) { $this->options = array_merge($this->options, $options); @@ -90,16 +90,16 @@ public function highlight($text, $needle, $tag = 'em', $options = []) return $text; } - /** - * find the locations of each of the words - * Nothing exciting here. The array_unique is required - * unless you decide to make the words unique before passing in - * - * @param $words - * @param $fulltext - * - * @return array - */ + /** + * find the locations of each of the words + * Nothing exciting here. 
The array_unique is required + * unless you decide to make the words unique before passing in + * + * @param $words + * @param $fulltext + * + * @return array + */ public function _extractLocations($words, $fulltext) { $locations = array(); @@ -117,19 +117,19 @@ public function _extractLocations($words, $fulltext) return $locations; } - /** - * Work out which is the most relevant portion to display - * This is done by looping over each match and finding the smallest distance between two found - * strings. The idea being that the closer the terms are the better match the snippet would be. - * When checking for matches we only change the location if there is a better match. - * The only exception is where we have only two matches in which case we just take the - * first as will be equally distant. - * - * @param $locations - * @param $prevcount - * - * @return int - */ + /** + * Work out which is the most relevant portion to display + * This is done by looping over each match and finding the smallest distance between two found + * strings. The idea being that the closer the terms are the better match the snippet would be. + * When checking for matches we only change the location if there is a better match. + * The only exception is where we have only two matches in which case we just take the + * first as will be equally distant. 
+ * + * @param $locations + * @param $prevcount + * + * @return int + */ public function _determineSnipLocation($locations, $prevcount) { if (!isset($locations[0])) { @@ -162,18 +162,18 @@ public function _determineSnipLocation($locations, $prevcount) return $startpos; } - /** - * 1/6 ratio on prevcount tends to work pretty well and puts the terms - * in the middle of the extract - * - * @param $words - * @param $fulltext - * @param int $rellength - * @param int $prevcount - * @param string $indicator - * - * @return bool|string - */ + /** + * 1/6 ratio on prevcount tends to work pretty well and puts the terms + * in the middle of the extract + * + * @param $words + * @param $fulltext + * @param int $rellength + * @param int $prevcount + * @param string $indicator + * + * @return bool|string + */ public function extractRelevant($words, $fulltext, $rellength = 300, $prevcount = 50, $indicator = '...') { $words = preg_split($this->tokenizer->getPattern(), $words, -1, PREG_SPLIT_NO_EMPTY); From e8a9848c984b951b889d3f1bf9da11385c2f422c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vilius=20=C5=A0umskas?= Date: Fri, 29 Oct 2021 02:40:33 +0300 Subject: [PATCH 4/4] Correctly fix docblock spacing this time --- src/Support/Highlighter.php | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Support/Highlighter.php b/src/Support/Highlighter.php index aa42070..9d82449 100644 --- a/src/Support/Highlighter.php +++ b/src/Support/Highlighter.php @@ -28,13 +28,13 @@ public function __construct(TokenizerInterface $tokenizer = null) } /** - * @param $text - * @param $needle - * @param string $tag - * @param array $options - * - * @return string - */ + * @param $text + * @param $needle + * @param string $tag + * @param array $options + * + * @return string + */ public function highlight($text, $needle, $tag = 'em', $options = []) { $this->options = array_merge($this->options, $options);