Skip to content

Commit

Permalink
Merge pull request #264 from esmero/ISSUE-263
Browse files Browse the repository at this point in the history
ISSUE-263: Improve HL for JOINS + Phrase Linking + Advanced Search HL
  • Loading branch information
DiegoPino authored Apr 14, 2023
2 parents 041183e + 2a79207 commit 50144f6
Show file tree
Hide file tree
Showing 8 changed files with 327 additions and 112 deletions.
3 changes: 3 additions & 0 deletions config/schema/strawberryfield.schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,9 @@ plugin.plugin_configuration.search_api_processor.sbf_highlight:
highlight_processing:
type: string
label: 'Defines whether highlight and excerpt should be processed from backend highlighter or via post processing.'
highlight_backend_use_keys:
type: boolean
label: 'Whether highlighted keys returned by Solr should be used.'
highlight_partial:
type: boolean
label: 'Whether matches in parts of words should be highlighted'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,6 @@ protected function flavorfromSolrIndex(string $term, int $nodeid, string $proces

$parse_mode = $this->parseModeManager->createInstance('terms');
$query->setParseMode($parse_mode);
$query->sort('search_api_relevance', 'DESC');
$query->keys($term);

$query->setFulltextFields(['ocr_text']);
Expand Down Expand Up @@ -252,6 +251,7 @@ protected function flavorfromSolrIndex(string $term, int $nodeid, string $proces
}
if (isset($allfields_translated_to_solr['sequence_id'])) {
$fields_to_retrieve['sequence_id'] = $allfields_translated_to_solr['sequence_id'];
$query->sort('sequence_id', QueryInterface::SORT_ASC);
}
if (isset($allfields_translated_to_solr['file_uuid'])) {
$fields_to_retrieve['file_uuid'] = $allfields_translated_to_solr['file_uuid'];
Expand All @@ -275,7 +275,7 @@ protected function flavorfromSolrIndex(string $term, int $nodeid, string $proces
4 => 'ss_search_api_language:("en" "und" "zxx")',
),
*/

$query->sort('search_api_relevance', 'DESC');
$query->setProcessingLevel(QueryInterface::PROCESSING_BASIC);
$results = $query->execute();
$extradata = $results->getAllExtraData() ?? [];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ protected function trackDeleted(EntityInterface $entity) {
$query = $index->query(['offset' => 0, 'limit' => $limit]);
$query->addCondition('search_api_datasource', $datasource_id)
->addCondition('uuid', $entity->uuid());
$query->setOption('search_api_retrieved_field_values', ['id']);
$query->setOption('search_api_retrieved_field_values', ['id' => 'id']);
// Query breaks if not because standard hl is enabled for all fields.
// and normal hl offsets on OCR HL specific ones.
$query->setOption('ocr_highlight', 'on');
Expand All @@ -91,6 +91,7 @@ protected function trackDeleted(EntityInterface $entity) {
}
// If there are still more left, change the range and query again.
if (count($tracked_ids) < $max) {
$query = $query->getOriginalQuery();
$query->range($limit * $i, $limit);
$results = $query->execute();
$newcount = $results->getResultCount();
Expand Down
3 changes: 2 additions & 1 deletion src/EventSubscriber/StrawberryEventSaveFlavorSubscriber.php
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ protected function trackFilesDeleted(EntityInterface $entity, FileInterface $fil
$query->addCondition('file_uuid', $file->uuid())
->addCondition('search_api_datasource', $datasource_id)
->addCondition('uuid', $entity->uuid());
$query->setOption('search_api_retrieved_field_values', ['id']);
$query->setOption('search_api_retrieved_field_values', ['id' => 'id']);
// Query breaks if not because standard hl is enabled for all fields.
// and normal hl offsets on OCR HL specific ones.
$query->setOption('ocr_highlight', 'on');
Expand All @@ -131,6 +131,7 @@ protected function trackFilesDeleted(EntityInterface $entity, FileInterface $fil
}
// If there are still more left, change the range and query again.
if (count($tracked_ids) < $max) {
$query = $query->getOriginalQuery();
$query->range($limit * $i, $limit);
$results = $query->execute();
$newcount = $results->getResultCount();
Expand Down
288 changes: 219 additions & 69 deletions src/Plugin/search_api/processor/StrawberryFieldHighlight.php

Large diffs are not rendered by default.

68 changes: 56 additions & 12 deletions src/Plugin/search_api/processor/StrawberryFlavorAggregate.php
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,11 @@ public function addFieldValues(ItemInterface $item) {
array_map('trim', $processor_ids)
);
foreach ($processor_ids as $processor_id) {

$flavors = $this->flavorsfromSolrIndex(
$node->id(), $processor_id, $indexes, $limit = 500
$node->id(), $processor_id, $indexes, 50, 500
);

$flavors = array_filter($flavors);
if (count($flavors)) {
$flavors = array_values($flavors);
Expand Down Expand Up @@ -201,12 +203,13 @@ public function addFieldValues(ItemInterface $item) {
* @param string $file_uuid
* @param array $indexes
* @param int $limit
*
* The number of SBF Documents to get per query
* @param int $max The maximum number of SBF Documents to fetch in total.
* @return array[]
* @throws \Drupal\Component\Plugin\Exception\PluginException
* @throws \Drupal\search_api\SearchApiException
*/
protected function flavorsfromSolrIndex(int $nodeid, string $processor, array $indexes, $limit = 500) {
protected function flavorsfromSolrIndex(int $nodeid, string $processor, array $indexes, $limit = 50, $max = 500) {
$values = [];
/* @var \Drupal\search_api\IndexInterface[] $indexes */
foreach ($indexes as $search_api_index) {
Expand Down Expand Up @@ -261,28 +264,47 @@ protected function flavorsfromSolrIndex(int $nodeid, string $processor, array $i
'sequence_id'
);

$query->sort('search_api_relevance', 'DESC');
$sorted = FALSE;
// Override Sort if we have a sequence ID for this data source
foreach ($fields_with_sequence_id as $field_with_sequence_id) {
// \Drupal\search_api\Plugin\search_api\data_type\IntegerDataType
if ($field_with_sequence_id->getType() == 'string' || $field_with_sequence_id->getType() == 'integer') {
$query->sort($field_with_sequence_id->getFieldIdentifier(), 'DESC');
if ($field_with_sequence_id->getType() == 'integer'
) {
$query->sort($field_with_sequence_id->getFieldIdentifier(), 'ASC');
$sorted = TRUE;
break;
}
}
if (!$sorted) {
// Sorting by a string is no different from sorting by the id itself.
$query->sort('search_api_id', 'ASC');
}

$field_with_plaintex = $this->getFieldsHelper()
$fields_with_plaintext = $this->getFieldsHelper()
->filterForPropertyPath(
$search_api_index->getFields(), 'strawberryfield_flavor_datasource',
'plaintext'
);
// Needed to avoid statically caching the results
// $query->getOriginalQuery() is not reliable and eventually
// gets polluted (marked as processed)
// Drupal why is your code so messy?
$query->setProcessingLevel(QueryInterface::PROCESSING_NONE);

$query->setProcessingLevel(QueryInterface::PROCESSING_BASIC);
try {
$fields = ['search_api_relevance','search_api_datasource','search_api_language','search_api_id'];
foreach ($fields_with_plaintext as $key => $field_data) {
$fields[] = $key;
}
$fields = array_combine($fields, $fields);
$query->setOption('search_api_retrieved_field_values', $fields);
$results = $query->execute();
}
catch (\Exception $exception) {
$this->logException($exception, '%type while trying to fetch Strawberry Flavors from Search API');
$this->logException(
$exception,
'%type while trying to fetch Strawberry Flavors from Search API'
);
return $values;
}
// remove the ID and the parent, not needed for file matching
Expand All @@ -292,9 +314,18 @@ protected function flavorsfromSolrIndex(int $nodeid, string $processor, array $i
]
];

if ($results->getResultCount() >= 1) {
$i = 0;
$j = 0;
$max_from_backend = $results->getResultCount();
$max_from_backend = $newcount = $max_from_backend > $max ? $max : $max_from_backend;

while ($j < $max_from_backend && $newcount > 0) {
$i++;
foreach ($results->getResultItems() as $resultItem) {
$property_values = $this->getFieldsHelper()->extractItemValues([$resultItem], $required_properties_by_datasource, false);
$j++;
$property_values = $this->getFieldsHelper()->extractItemValues(
[$resultItem], $required_properties_by_datasource, FALSE
);
foreach ($property_values as $plaintext) {
if (($plaintext['plaintext'][0] ?? NULL) instanceof TextValue) {
// Wonder if we can use __toString() here as a magic prop
Expand All @@ -308,12 +339,25 @@ protected function flavorsfromSolrIndex(int $nodeid, string $processor, array $i
$text_to_clean = str_replace("-\n ", "", $text_to_clean);
$text_to_clean = str_replace("\n ", " ", $text_to_clean);
$text_to_clean = str_replace("\n", " ", $text_to_clean);
$text_to_clean = preg_replace(['/\h{2,}|(\h*\v{1,})/umi', '/\v{2,}/uim', '/\h{2,}/uim'], [' ', ' ', ' '], $text_to_clean);
$text_to_clean = preg_replace(
['/\h{2,}|(\h*\v{1,})/umi', '/\v{2,}/uim', '/\h{2,}/uim'],
[' ', ' ', ' '], $text_to_clean
);
if (strlen(trim($text_to_clean)) > 0) {
$values[] = $text_to_clean;
}
}
}
if ($j < $max_from_backend && $j > 0) {
// Reusing the query cannot be done because it will return the original
// query results, which are statically cached.
// I could clone and clone, but that would use extra memory,
// so I remove PROCESSING to avoid returning the same 50!
$query = $query->getOriginalQuery();
$query->range($limit * $i, $limit);
$results = $query->execute();
$newcount = $results->getResultCount();
}
}
}
return $values;
Expand Down
22 changes: 15 additions & 7 deletions src/Plugin/search_api/processor/StrawberryReduceReturn.php
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,9 @@ protected function setConfigFactory(ConfigFactoryInterface $config_factory) {
*/
public function preprocessSearchQuery(QueryInterface $query) {
// We really don't want to return the aggregated fields this processor
// Provides
// Unnecessary HUGE payload.
// Provides. Unnecessary HUGE payload.
if (isset($query->getOptions()['search_api_view']) && $query->getOptions()['search_api_view']->getDisplay()->usesFields()) {
//don't override any other options set by someone else.

if (empty($query->getOptions()['search_api_retrieved_field_values'] ?? [])) {
$fields = [];
// Get me all the fields of this index (gosh)
Expand All @@ -90,14 +88,24 @@ public function preprocessSearchQuery(QueryInterface $query) {
unset($fields[$key]);
}
$fields = array_values(array_keys($fields));
$fields += ['search_api_relevance','search_api_datasource','search_api_language','search_api_id'];
// See \Drupal\search_api_solr\Plugin\search_api\backend\SearchApiSolrBackend::getRequiredFields for a better list
// Sadly that method is protected so we will need to re-write it here?
$fields += ['search_api_relevance','search_api_datasource','search_api_language','search_api_id','site_hash'];
$query->setOption('search_api_retrieved_field_values', $fields);
$query->setOption('highlight_reduce_return', ['*']);
// don't override highlight fields if any other SBF module/processor has already done this
if ($query->getOption('sbf_highlight_fields', NULL)) {
$query->setOption('sbf_highlight_fields', ['*']);
}
}
}
elseif (isset($query->getOptions()['search_api_view']) && !$query->getOptions()['search_api_view']->getDisplay()->usesFields()) {
$query->setOption('search_api_retrieved_field_values', ['search_api_relevance','search_api_datasource','search_api_language','search_api_id']);
$query->setOption('highlight_reduce_return', ['*']);
// See \Drupal\search_api_solr\Plugin\search_api\backend\SearchApiSolrBackend::getRequiredFields for a better list
// Sadly that method is protected so we will need to re-write it here?
$query->setOption('search_api_retrieved_field_values', ['search_api_relevance','search_api_datasource','search_api_language','search_api_id','site_hash']);
// don't override highlight fields if any other SBF module/processor has already done this
if ($query->getOption('sbf_highlight_fields', NULL)) {
$query->setOption('sbf_highlight_fields', ['*']);
}
}
}
}
48 changes: 28 additions & 20 deletions strawberryfield.module
Original file line number Diff line number Diff line change
Expand Up @@ -369,30 +369,31 @@ function strawberryfield_search_api_solr_query_alter(SolariumQueryInterface $sol

$solarium_query->addParam('hl.ocr.fl', $solr_field_names['ocr_text']);
$solarium_query->addParam('hl.ocr.absoluteHighlights', 'on');
$solarium_query->addParam('hl.method', 'UnifiedHighlighter');
// Only place where unified is justified
$hl->setMethod('unified');
}
}
elseif ($query->getOption('highlight_reduce_return', FALSE)) {
elseif ($query->getOption('sbf_highlight_fields', FALSE)) {
//advanced_highlight_return
// ELSEIF bc OCR and these ones are incompatible
/* @var \Solarium\Component\Highlighting\Highlighting $hl */
$hl = $solarium_query->getHighlighting();
$highlight_fields = $query->getOption('highlight_reduce_return',[]);
if ($highlight_fields != ['*']) {
$highlight_fields = array_filter(
$highlight_fields, function ($v) {
return preg_match('/^t.?[sm]_/', $v) || preg_match('/^s[sm]_/', $v);
}
);
}
else {
$hl->setRequireFieldMatch(TRUE);
}

$highlight_fields = $query->getOption('sbf_highlight_fields',[]);
foreach ($highlight_fields as $highlighted_field) {
// We must not set the fields at once using setFields() to not break
// the altered queries.
$hl->addField($highlighted_field);
$hl->addField($highlighted_field);
}

// Force HL to original for now. We can make this an option,
// but given the Drupal nature of treating all Full Text fields as the same,
// if a given Full Text field does not contain the vector index data required, this will
// fail. Unified does not play well with JOINs on Solr 9.1, throwing
// a class mismatch even if we are not asking for Highlights from the flavor.
// @TODO revisit for Solr 9.2.x
$hl->setUsePhraseHighlighter(TRUE);
$hl->setMethod('original');
$hl->setFragSize(128);
$hl->setRequireFieldMatch(TRUE);
}
}
Expand All @@ -405,19 +406,26 @@ function strawberryfield_search_api_solr_converted_query_alter(SolariumQueryInte
ComponentAwareQueryInterface::COMPONENT_EDISMAX
);
$solarium_query->addParam('defType', 'lucene');
/* @var \Solarium\Component\Highlighting\Highlighting $hl */
$hl = $solarium_query->getHighlighting();

$hl->setUsePhraseHighlighter(TRUE);
$hl->setDefaultSummary(TRUE);
$hl->setMethod('original');
$hl->setRequireFieldMatch(TRUE);
//$hl->setMethod('unified'); This requires omitTermFreqAndPositions=FALSE;
// @TODO make this a setting? 128 feels like a decent number TBH.
$hl->setFragsizeIsMinimum(FALSE);
$hl->setMergeContiguous(TRUE);
$hl->setFragSize(128);
$hl->setDefaultSummary(TRUE);
if ($combined_keys = $query->getOption('sbf_join_flavor')['hl'] ?? NULL) {
$hl->setQuery($combined_keys);
}

// Because the Query Sets a few Fields to retrieve (to make it faster)
// But Search API is silly and decides that when that happens
// I want only those fields highlighted
// By setting them to all ('*') but limiting via setRequireFieldMatch we only get the matched ones.
// This fails with JOINS and unified so we set method original.
$hl->setFields(['*']);
$hl->setFragsizeIsMinimum(FALSE);

}
}
}
Expand Down

0 comments on commit 50144f6

Please sign in to comment.