diff --git a/doc/bc/changes-5.3.md b/doc/bc/changes-5.3.md index 255446b78..d85513b0c 100644 --- a/doc/bc/changes-5.3.md +++ b/doc/bc/changes-5.3.md @@ -75,6 +75,17 @@ Changes affecting version compatibility with former or future versions. * 5.3.4: `ViewCaching` legacy setting is now enforced and injected in legacy kernel when booted. This is to avoid persistence/Http cache clear not working when publishing content. +* 5.3.5: Legacy Search Engine FullText searchThresholdValue -> stopWordThresholdFactor + + EZP-24213: the "Stop Word Threshold" configuration, `searchThresholdValue`, was hardcoded + to 20 items. It is now changed to `stopWordThresholdFactor`, a factor (between 0 and 1) + for the percentage of content objects to set the Stop Word Threshold to. Default value + is set to 0.66, meaning if you search for a common word like "the", it will be ignored + from the search expression if more than 66% of your content contains the word. + + Note: Does not affect future Solr/ElasticSearch search engines which have far more + advanced search options built in. + ## Deprecations * Method `eZ\Publish\API\Repository\RoleService::removePolicy` is deprecated in diff --git a/doc/bc/changes-5.4.md b/doc/bc/changes-5.4.md index 4313c01ff..c60216210 100644 --- a/doc/bc/changes-5.4.md +++ b/doc/bc/changes-5.4.md @@ -87,6 +87,17 @@ Changes affecting version compatibility with former or future versions. * `ViewCaching` legacy setting is now enforced and injected in legacy kernel when booted. This is to avoid persistence/Http cache clear not working when publishing content. +* 5.4.2: Legacy Search Engine FullText searchThresholdValue -> stopWordThresholdFactor + + EZP-24213: the "Stop Word Threshold" configuration, `searchThresholdValue`, was hardcoded + to 20 items. It is now changed to `stopWordThresholdFactor`, a factor (between 0 and 1) + for the percentage of content objects to set the Stop Word Threshold to. 
Default value + is set to 0.66, meaning if you search for a common word like "the", it will be ignored + from the search expression if more than 66% of your content contains the word. + + Note: Does not affect future Solr/ElasticSearch search engines which have far more + advanced search options built in. + ## Deprecations * `imagemagick` siteaccess settings are now deprecated. It is mandatory to remove them. diff --git a/eZ/Publish/Core/Persistence/Legacy/Content/Search/Common/Gateway/CriterionHandler/FullText.php b/eZ/Publish/Core/Persistence/Legacy/Content/Search/Common/Gateway/CriterionHandler/FullText.php index fa929b9d2..52b481a2c 100644 --- a/eZ/Publish/Core/Persistence/Legacy/Content/Search/Common/Gateway/CriterionHandler/FullText.php +++ b/eZ/Publish/Core/Persistence/Legacy/Content/Search/Common/Gateway/CriterionHandler/FullText.php @@ -11,9 +11,10 @@ use eZ\Publish\Core\Persistence\Legacy\Content\Search\Common\Gateway\CriterionHandler; use eZ\Publish\Core\Persistence\Legacy\Content\Search\Common\Gateway\CriteriaConverter; +use eZ\Publish\API\Repository\Values\Content\Query\Criterion; +use eZ\Publish\Core\Base\Exceptions\InvalidArgumentException; use eZ\Publish\Core\Persistence\TransformationProcessor; use eZ\Publish\Core\Persistence\Database\DatabaseHandler; -use eZ\Publish\API\Repository\Values\Content\Query\Criterion; use eZ\Publish\Core\Persistence\Database\SelectQuery; /** @@ -27,7 +28,8 @@ class FullText extends CriterionHandler * @var array */ protected $configuration = array( - 'searchThresholdValue' => 20, + // @see getStopWordThresholdValue() + 'stopWordThresholdFactor' => 0.66, 'enableWildcards' => true, 'commands' => array( 'apostrophe_normalize', @@ -65,6 +67,12 @@ class FullText extends CriterionHandler ) ); + /** + * @var int|null + * @see getStopWordThresholdValue() + */ + private $stopWordThresholdValue; + /** * Transformation processor to normalize search strings * @@ -78,6 +86,8 @@ class FullText extends CriterionHandler * @param 
\eZ\Publish\Core\Persistence\Database\DatabaseHandler $dbHandler * @param \eZ\Publish\Core\Persistence\TransformationProcessor $processor * @param array $configuration + * + * @throws InvalidArgumentException On invalid $configuration values */ public function __construct( DatabaseHandler $dbHandler, @@ -89,6 +99,17 @@ public function __construct( $this->configuration = $configuration + $this->configuration; $this->processor = $processor; + + if ( + $this->configuration['stopWordThresholdFactor'] < 0 || + $this->configuration['stopWordThresholdFactor'] > 1 + ) + { + throw new InvalidArgumentException( + "\$configuration['stopWordThresholdFactor']", + "Stop Word Threshold Factor needs to be between 0 and 1, got: " . $this->configuration['stopWordThresholdFactor'] + ); + } } /** @@ -161,6 +182,8 @@ protected function getWordExpression( SelectQuery $query, $token ) /** * Get subquery to select relevant word IDs * + * @uses getStopWordThresholdValue() To get threshold for words we would like to ignore in query. 
+ * @param \eZ\Publish\Core\Persistence\Database\SelectQuery $query * @param string $string * @@ -178,18 +201,24 @@ protected function getWordIdSubquery( SelectQuery $query, $string ) $wordExpressions[] = $this->getWordExpression( $subQuery, $token ); } + $whereCondition = $subQuery->expr->lOr( $wordExpressions ); + + // If stop word threshold is below 100%, make it part of $whereCondition + if ( $this->configuration['stopWordThresholdFactor'] < 1 ) + { + $whereCondition = $subQuery->expr->lAnd( + $whereCondition, + $subQuery->expr->lt( + $this->dbHandler->quoteColumn( 'object_count' ), + $subQuery->bindValue( $this->getStopWordThresholdValue() ) + ) + ); + } + $subQuery ->select( $this->dbHandler->quoteColumn( 'id' ) ) ->from( $this->dbHandler->quoteTable( 'ezsearch_word' ) ) - ->where( - $subQuery->expr->lAnd( - $subQuery->expr->lOr( $wordExpressions ), - $subQuery->expr->lt( - $this->dbHandler->quoteColumn( 'object_count' ), - $subQuery->bindValue( $this->configuration['searchThresholdValue'] ) - ) - ) - ); + ->where( $whereCondition ); return $subQuery; } @@ -223,5 +252,41 @@ public function handle( CriteriaConverter $converter, SelectQuery $query, Criter $subSelect ); } + + /** + * Returns an exact content object count threshold to ignore common terms on. + * + * Common terms will be skipped if used in more than a given percentage of the total amount of content + * objects in the database. Percentage is defined by stopWordThresholdFactor configuration. + * + * Example: If stopWordThresholdFactor is 0.66 (66%), and a term like "the" exists in more than 66% of the content, it + * will ignore the phrase as it is assumed to not add any value to the search. + * + * Caches the result for the instance used as we don't need this to be super accurate as it is based on percentage, + * set by stopWordThresholdFactor. 
+ * + * @return int + */ + protected function getStopWordThresholdValue() + { + if ( $this->stopWordThresholdValue !== null ) + return $this->stopWordThresholdValue; + + // Cached value does not exist, do a simple count query on ezcontentobject table + $query = $this->dbHandler->createSelectQuery(); + $query + ->select( + $query->alias( $query->expr->count( '*' ), 'count' ) + ) + ->from( $this->dbHandler->quoteTable( "ezcontentobject" ) ); + + $statement = $query->prepare(); + $statement->execute(); + + // Calculate the int stopWordThresholdValue based on count (first column) * factor + return $this->stopWordThresholdValue = + (int)( $statement->fetchColumn() * $this->configuration['stopWordThresholdFactor'] ); + } + } diff --git a/eZ/Publish/Core/Persistence/Legacy/Tests/Content/LocationSearchHandlerTest.php b/eZ/Publish/Core/Persistence/Legacy/Tests/Content/LocationSearchHandlerTest.php index a7635697b..4a9ee4c89 100644 --- a/eZ/Publish/Core/Persistence/Legacy/Tests/Content/LocationSearchHandlerTest.php +++ b/eZ/Publish/Core/Persistence/Legacy/Tests/Content/LocationSearchHandlerTest.php @@ -1136,9 +1136,14 @@ public function testFullTextDisabledWildcardFilter() public function testFullTextFilterStopwordRemoval() { + $handler = $this->getLocationSearchHandler( + array( + 'stopWordThresholdFactor' => 0.1 + ) + ); $this->assertSearchResults( array(), - $this->getLocationSearchHandler()->findLocations( + $handler->findLocations( new LocationQuery( array( 'filter' => new Criterion\FullText( 'the' ), @@ -1153,7 +1158,7 @@ public function testFullTextFilterNoStopwordRemoval() { $handler = $this->getLocationSearchHandler( array( - 'searchThresholdValue' => PHP_INT_MAX + 'stopWordThresholdFactor' => 1 ) ); @@ -1184,6 +1189,18 @@ function ( $hit ) ); } + /** + * @expectedException \eZ\Publish\API\Repository\Exceptions\InvalidArgumentException + */ + public function testFullTextFilterInvalidStopwordThreshold() + { + $this->getLocationSearchHandler( + array( + 
'stopWordThresholdFactor' => 2 + ) + ); + } + public function testFieldRelationFilterContainsSingle() { $this->assertSearchResults( diff --git a/eZ/Publish/Core/Persistence/Legacy/Tests/Content/SearchHandlerTest.php b/eZ/Publish/Core/Persistence/Legacy/Tests/Content/SearchHandlerTest.php index de5d07f22..aecaa4a1e 100644 --- a/eZ/Publish/Core/Persistence/Legacy/Tests/Content/SearchHandlerTest.php +++ b/eZ/Publish/Core/Persistence/Legacy/Tests/Content/SearchHandlerTest.php @@ -1213,9 +1213,15 @@ public function testFullTextDisabledWildcardFilter() public function testFullTextFilterStopwordRemoval() { + $handler = $this->getContentSearchHandler( + array( + 'stopWordThresholdFactor' => 0.1 + ) + ); + $this->assertSearchResults( array(), - $this->getContentSearchHandler()->findContent( + $handler->findContent( new Query( array( 'filter' => new Criterion\FullText( 'the' ), @@ -1228,13 +1234,13 @@ public function testFullTextFilterStopwordRemoval() public function testFullTextFilterNoStopwordRemoval() { - $locator = $this->getContentSearchHandler( + $handler = $this->getContentSearchHandler( array( - 'searchThresholdValue' => PHP_INT_MAX + 'stopWordThresholdFactor' => 1 ) ); - $result = $locator->findContent( + $result = $handler->findContent( new Query( array( 'filter' => new Criterion\FullText( @@ -1259,6 +1265,18 @@ function ( $hit ) ); } + /** + * @expectedException \eZ\Publish\API\Repository\Exceptions\InvalidArgumentException + */ + public function testFullTextFilterInvalidStopwordThreshold() + { + $this->getContentSearchHandler( + array( + 'stopWordThresholdFactor' => 2 + ) + ); + } + public function testObjectStateIdFilter() { $this->assertSearchResults(