Skip to content
This repository has been archived by the owner on Jan 11, 2021. It is now read-only.

Commit

Permalink
Merge pull request #49 from xserna/fulltext_searchThreshold_percentage
Browse files Browse the repository at this point in the history
Fix EZP-24213: FullText stopWordThreshold should be percentage of con…
  • Loading branch information
emodric authored Jun 29, 2017
2 parents 6631f29 + d681394 commit b7f1b1a
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 17 deletions.
11 changes: 11 additions & 0 deletions doc/bc/changes-5.3.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,17 @@ Changes affecting version compatibility with former or future versions.
* 5.3.4: `ViewCaching` legacy setting is now enforced and injected in legacy kernel when booted. This is to avoid persistence/Http
cache clear not working when publishing content.

* 5.3.5: Legacy Search Engine FullText searchThresholdValue -> stopWordThresholdFactor

EZP-24213: the "Stop Word Threshold" configuration, `searchThresholdValue`, was hardcoded
to 20 items. It is now changed to `stopWordThresholdFactor`, a factor (between 0 and 1)
for the percentage of content objects to set the Stop Word Threshold to. Default value
is set to 0.66, meaning if you search for a common word like "the", it will be ignored
from the search expression if more than 66% of your content contains the word.

Note: This does not affect the future Solr/ElasticSearch search engines, which have
far more advanced search options built in.

## Deprecations

* Method `eZ\Publish\API\Repository\RoleService::removePolicy` is deprecated in
Expand Down
11 changes: 11 additions & 0 deletions doc/bc/changes-5.4.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,17 @@ Changes affecting version compatibility with former or future versions.
* `ViewCaching` legacy setting is now enforced and injected in legacy kernel when booted. This is to avoid persistence/Http
cache clear not working when publishing content.

* 5.4.2: Legacy Search Engine FullText searchThresholdValue -> stopWordThresholdFactor

EZP-24213: the "Stop Word Threshold" configuration, `searchThresholdValue`, was hardcoded
to 20 items. It is now changed to `stopWordThresholdFactor`, a factor (between 0 and 1)
for the percentage of content objects to set the Stop Word Threshold to. Default value
is set to 0.66, meaning if you search for a common word like "the", it will be ignored
from the search expression if more than 66% of your content contains the word.

Note: This does not affect the future Solr/ElasticSearch search engines, which have
far more advanced search options built in.

## Deprecations

* `imagemagick` siteaccess settings are now deprecated. It is mandatory to remove them.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@

use eZ\Publish\Core\Persistence\Legacy\Content\Search\Common\Gateway\CriterionHandler;
use eZ\Publish\Core\Persistence\Legacy\Content\Search\Common\Gateway\CriteriaConverter;
use eZ\Publish\API\Repository\Values\Content\Query\Criterion;
use eZ\Publish\Core\Base\Exceptions\InvalidArgumentException;
use eZ\Publish\Core\Persistence\TransformationProcessor;
use eZ\Publish\Core\Persistence\Database\DatabaseHandler;
use eZ\Publish\API\Repository\Values\Content\Query\Criterion;
use eZ\Publish\Core\Persistence\Database\SelectQuery;

/**
Expand All @@ -27,7 +28,8 @@ class FullText extends CriterionHandler
* @var array
*/
protected $configuration = array(
'searchThresholdValue' => 20,
// @see getStopWordThresholdValue()
'stopWordThresholdFactor' => 0.66,
'enableWildcards' => true,
'commands' => array(
'apostrophe_normalize',
Expand Down Expand Up @@ -65,6 +67,12 @@ class FullText extends CriterionHandler
)
);

/**
* @var int|null
* @see getStopWordThresholdValue()
*/
private $stopWordThresholdValue;

/**
* Transformation processor to normalize search strings
*
Expand All @@ -78,6 +86,8 @@ class FullText extends CriterionHandler
* @param \eZ\Publish\Core\Persistence\Database\DatabaseHandler $dbHandler
* @param \eZ\Publish\Core\Persistence\TransformationProcessor $processor
* @param array $configuration
*
* @throws InvalidArgumentException On invalid $configuration values
*/
public function __construct(
DatabaseHandler $dbHandler,
Expand All @@ -89,6 +99,17 @@ public function __construct(

$this->configuration = $configuration + $this->configuration;
$this->processor = $processor;

if (
$this->configuration['stopWordThresholdFactor'] < 0 ||
$this->configuration['stopWordThresholdFactor'] > 1
)
{
throw new InvalidArgumentException(
"\$configuration['stopWordThresholdFactor']",
"Stop Word Threshold Factor needs to be between 0 and 1, got: " . $this->configuration['stopWordThresholdFactor']
);
}
}

/**
Expand Down Expand Up @@ -161,6 +182,8 @@ protected function getWordExpression( SelectQuery $query, $token )
/**
* Get subquery to select relevant word IDs
*
* @uses getStopWordThresholdValue() To get threshold for words we would like to ignore in query.
*
* @param \eZ\Publish\Core\Persistence\Database\SelectQuery $query
* @param string $string
*
Expand All @@ -178,18 +201,24 @@ protected function getWordIdSubquery( SelectQuery $query, $string )
$wordExpressions[] = $this->getWordExpression( $subQuery, $token );
}

$whereCondition = $subQuery->expr->lOr( $wordExpressions );

// If stop word threshold is below 100%, make it part of $whereCondition
if ( $this->configuration['stopWordThresholdFactor'] < 1 )
{
$whereCondition = $subQuery->expr->lAnd(
$whereCondition,
$subQuery->expr->lt(
$this->dbHandler->quoteColumn( 'object_count' ),
$subQuery->bindValue( $this->getStopWordThresholdValue() )
)
);
}

$subQuery
->select( $this->dbHandler->quoteColumn( 'id' ) )
->from( $this->dbHandler->quoteTable( 'ezsearch_word' ) )
->where(
$subQuery->expr->lAnd(
$subQuery->expr->lOr( $wordExpressions ),
$subQuery->expr->lt(
$this->dbHandler->quoteColumn( 'object_count' ),
$subQuery->bindValue( $this->configuration['searchThresholdValue'] )
)
)
);
->where( $whereCondition );
return $subQuery;
}

Expand Down Expand Up @@ -223,5 +252,41 @@ public function handle( CriteriaConverter $converter, SelectQuery $query, Criter
$subSelect
);
}

/**
 * Returns the absolute content object count used as the stop word threshold.
 *
 * The threshold is the total number of rows in the ezcontentobject table multiplied by the
 * stopWordThresholdFactor configuration value, truncated to an integer.
 *
 * Example: with a stopWordThresholdFactor of 0.66 and 1000 content objects, the returned
 * threshold is 660.
 *
 * The value is computed once per handler instance and then reused; since it is derived from a
 * percentage it does not need to track the exact object count on every call.
 *
 * @return int
 */
protected function getStopWordThresholdValue()
{
    // Lazily compute the threshold the first time it is requested on this instance
    if ( $this->stopWordThresholdValue === null )
    {
        $countQuery = $this->dbHandler->createSelectQuery();
        $countQuery->select(
            $countQuery->alias( $countQuery->expr->count( '*' ), 'count' )
        );
        $countQuery->from( $this->dbHandler->quoteTable( "ezcontentobject" ) );

        $countStatement = $countQuery->prepare();
        $countStatement->execute();

        // First column of the result row holds the total content object count
        $totalObjectCount = $countStatement->fetchColumn();

        $this->stopWordThresholdValue =
            (int)( $totalObjectCount * $this->configuration['stopWordThresholdFactor'] );
    }

    return $this->stopWordThresholdValue;
}

}

Original file line number Diff line number Diff line change
Expand Up @@ -1136,9 +1136,14 @@ public function testFullTextDisabledWildcardFilter()

public function testFullTextFilterStopwordRemoval()
{
$handler = $this->getLocationSearchHandler(
array(
'stopWordThresholdFactor' => 0.1
)
);
$this->assertSearchResults(
array(),
$this->getLocationSearchHandler()->findLocations(
$handler->findLocations(
new LocationQuery(
array(
'filter' => new Criterion\FullText( 'the' ),
Expand All @@ -1153,7 +1158,7 @@ public function testFullTextFilterNoStopwordRemoval()
{
$handler = $this->getLocationSearchHandler(
array(
'searchThresholdValue' => PHP_INT_MAX
'stopWordThresholdFactor' => 1
)
);

Expand Down Expand Up @@ -1184,6 +1189,18 @@ function ( $hit )
);
}

/**
 * Requesting a location search handler with a stopWordThresholdFactor of 2
 * is expected to raise an InvalidArgumentException.
 *
 * @expectedException \eZ\Publish\API\Repository\Exceptions\InvalidArgumentException
 */
public function testFullTextFilterInvalidStopwordThreshold()
{
    $invalidConfiguration = array( 'stopWordThresholdFactor' => 2 );

    $this->getLocationSearchHandler( $invalidConfiguration );
}

public function testFieldRelationFilterContainsSingle()
{
$this->assertSearchResults(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1213,9 +1213,15 @@ public function testFullTextDisabledWildcardFilter()

public function testFullTextFilterStopwordRemoval()
{
$handler = $this->getContentSearchHandler(
array(
'stopWordThresholdFactor' => 0.1
)
);

$this->assertSearchResults(
array(),
$this->getContentSearchHandler()->findContent(
$handler->findContent(
new Query(
array(
'filter' => new Criterion\FullText( 'the' ),
Expand All @@ -1228,13 +1234,13 @@ public function testFullTextFilterStopwordRemoval()

public function testFullTextFilterNoStopwordRemoval()
{
$locator = $this->getContentSearchHandler(
$handler = $this->getContentSearchHandler(
array(
'searchThresholdValue' => PHP_INT_MAX
'stopWordThresholdFactor' => 1
)
);

$result = $locator->findContent(
$result = $handler->findContent(
new Query(
array(
'filter' => new Criterion\FullText(
Expand All @@ -1259,6 +1265,18 @@ function ( $hit )
);
}

/**
 * Requesting a content search handler with a stopWordThresholdFactor of 2
 * is expected to raise an InvalidArgumentException.
 *
 * @expectedException \eZ\Publish\API\Repository\Exceptions\InvalidArgumentException
 */
public function testFullTextFilterInvalidStopwordThreshold()
{
    $invalidConfiguration = array( 'stopWordThresholdFactor' => 2 );

    $this->getContentSearchHandler( $invalidConfiguration );
}

public function testObjectStateIdFilter()
{
$this->assertSearchResults(
Expand Down

0 comments on commit b7f1b1a

Please sign in to comment.