Skip to content

Commit

Permalink
Merge pull request #256 from ViliusS/fix/highlighter-fixes
Browse files Browse the repository at this point in the history
Fix unicode and word tokens in highlighter.
  • Loading branch information
nticaric authored Oct 29, 2021
2 parents 1fd7b79 + e8a9848 commit 4225e12
Showing 1 changed file with 64 additions and 50 deletions.
114 changes: 64 additions & 50 deletions src/Support/Highlighter.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ public function __construct(TokenizerInterface $tokenizer = null)
}
}

/**
* @param $text
* @param $needle
* @param string $tag
* @param array $options
*
* @return string
*/
/**
* @param $text
* @param $needle
* @param string $tag
* @param array $options
*
* @return string
*/
public function highlight($text, $needle, $tag = 'em', $options = [])
{
$this->options = array_merge($this->options, $options);
Expand Down Expand Up @@ -90,25 +90,25 @@ public function highlight($text, $needle, $tag = 'em', $options = [])
return $text;
}

/**
* find the locations of each of the words
* Nothing exciting here. The array_unique is required
* unless you decide to make the words unique before passing in
*
* @param $words
* @param $fulltext
*
* @return array
*/
/**
* Find the locations of each of the words.
* Nothing exciting here. The array_unique call is required
* unless the words are made unique before being passed in.
*
* @param $words
* @param $fulltext
*
* @return array
*/
public function _extractLocations($words, $fulltext)
{
$locations = array();
foreach ($words as $word) {
$wordlen = strlen($word);
$loc = stripos($fulltext, $word);
$wordlen = mb_strlen($word);
$loc = mb_stripos($fulltext, $word);
while ($loc !== false) {
$locations[] = $loc;
$loc = stripos($fulltext, $word, $loc + $wordlen);
$loc = mb_stripos($fulltext, $word, $loc + $wordlen);
}
}
$locations = array_unique($locations);
Expand All @@ -117,19 +117,19 @@ public function _extractLocations($words, $fulltext)
return $locations;
}

/**
* Work out which is the most relevant portion to display
* This is done by looping over each match and finding the smallest distance between two found
* strings. The idea being that the closer the terms are the better match the snippet would be.
* When checking for matches we only change the location if there is a better match.
* The only exception is where we have only two matches in which case we just take the
* first as will be equally distant.
*
* @param $locations
* @param $prevcount
*
* @return int
*/
/**
* Work out which is the most relevant portion to display
* This is done by looping over each match and finding the smallest distance between two found
* strings. The idea being that the closer the terms are the better match the snippet would be.
* When checking for matches we only change the location if there is a better match.
* The only exception is where we have only two matches, in which case we just take the
* first, as both will be equally distant.
*
* @param $locations
* @param $prevcount
*
* @return int
*/
public function _determineSnipLocation($locations, $prevcount)
{
if (!isset($locations[0])) {
Expand Down Expand Up @@ -162,22 +162,22 @@ public function _determineSnipLocation($locations, $prevcount)
return $startpos;
}

/**
* 1/6 ratio on prevcount tends to work pretty well and puts the terms
* in the middle of the extract
*
* @param $words
* @param $fulltext
* @param int $rellength
* @param int $prevcount
* @param string $indicator
*
* @return bool|string
*/
/**
* 1/6 ratio on prevcount tends to work pretty well and puts the terms
* in the middle of the extract
*
* @param $words
* @param $fulltext
* @param int $rellength
* @param int $prevcount
* @param string $indicator
*
* @return bool|string
*/
public function extractRelevant($words, $fulltext, $rellength = 300, $prevcount = 50, $indicator = '...')
{
$words = preg_split($this->tokenizer->getPattern(), $words, -1, PREG_SPLIT_NO_EMPTY);
$textlength = strlen($fulltext);
$textlength = mb_strlen($fulltext);
if ($textlength <= $rellength) {
return $fulltext;
}
Expand All @@ -189,16 +189,30 @@ public function extractRelevant($words, $fulltext, $rellength = 300, $prevcount
$startpos = $startpos - ($textlength - $startpos) / 2;
}

$reltext = substr($fulltext, $startpos, $rellength);
// in case no match is found, reset position for proper math below
if ($startpos == -1) {
$startpos = 0;
}

$reltext = mb_substr($fulltext, $startpos, $rellength);
preg_match_all($this->tokenizer->getPattern(), $reltext, $offset, PREG_OFFSET_CAPTURE);
// PREG_OFFSET_CAPTURE returns offsets in bytes, so we convert them to character offsets with the mb_strlen(substr()) hack here
$last = mb_strlen(substr($reltext, 0, end($offset[0])[1]));
$first = mb_strlen(substr($reltext, 0, $offset[0][0][1]));

// if no match is found, just return first $rellength characters without the last word
if (empty($locations)) {
return mb_substr($reltext, 0, $last) . $indicator;
}

// check to ensure we don't snip the last word if that's the match
if ($startpos + $rellength < $textlength) {
$reltext = substr($reltext, 0, strrpos($reltext, " ")) . $indicator; // remove last word
$reltext = mb_substr($reltext, 0, $last) . $indicator; // remove last word
}

// If we trimmed from the front add ...
if ($startpos != 0) {
$reltext = $indicator . substr($reltext, strpos($reltext, " ") + 1); // remove first word
$reltext = $indicator . mb_substr($reltext, $first + 1); // remove first word
}

return $reltext;
Expand Down

0 comments on commit 4225e12

Please sign in to comment.