Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ISSUE-263: Improve HL for JOINS + Phrase Linking. #264

Merged
merged 12 commits into from
Apr 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions config/schema/strawberryfield.schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,9 @@ plugin.plugin_configuration.search_api_processor.sbf_highlight:
highlight_processing:
type: string
label: 'Defines whether highlight and excerpt should be processed from backend highlighter or via post processing.'
highlight_backend_use_keys:
type: boolean
label: 'Whether highlighted keys returned by Solr should be used.'
highlight_partial:
type: boolean
label: 'Whether matches in parts of words should be highlighted'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,6 @@ protected function flavorfromSolrIndex(string $term, int $nodeid, string $proces

$parse_mode = $this->parseModeManager->createInstance('terms');
$query->setParseMode($parse_mode);
$query->sort('search_api_relevance', 'DESC');
$query->keys($term);

$query->setFulltextFields(['ocr_text']);
Expand Down Expand Up @@ -252,6 +251,7 @@ protected function flavorfromSolrIndex(string $term, int $nodeid, string $proces
}
if (isset($allfields_translated_to_solr['sequence_id'])) {
$fields_to_retrieve['sequence_id'] = $allfields_translated_to_solr['sequence_id'];
$query->sort('sequence_id', QueryInterface::SORT_ASC);
}
if (isset($allfields_translated_to_solr['file_uuid'])) {
$fields_to_retrieve['file_uuid'] = $allfields_translated_to_solr['file_uuid'];
Expand All @@ -275,7 +275,7 @@ protected function flavorfromSolrIndex(string $term, int $nodeid, string $proces
4 => 'ss_search_api_language:("en" "und" "zxx")',
),
*/

$query->sort('search_api_relevance', 'DESC');
$query->setProcessingLevel(QueryInterface::PROCESSING_BASIC);
$results = $query->execute();
$extradata = $results->getAllExtraData() ?? [];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ protected function trackDeleted(EntityInterface $entity) {
$query = $index->query(['offset' => 0, 'limit' => $limit]);
$query->addCondition('search_api_datasource', $datasource_id)
->addCondition('uuid', $entity->uuid());
$query->setOption('search_api_retrieved_field_values', ['id']);
$query->setOption('search_api_retrieved_field_values', ['id' => 'id']);
// Query breaks if not because standard hl is enabled for all fields.
// and normal hl offsets on OCR HL specific ones.
$query->setOption('ocr_highlight', 'on');
Expand All @@ -91,6 +91,7 @@ protected function trackDeleted(EntityInterface $entity) {
}
// If there are still more left, change the range and query again.
if (count($tracked_ids) < $max) {
$query = $query->getOriginalQuery();
$query->range($limit * $i, $limit);
$results = $query->execute();
$newcount = $results->getResultCount();
Expand Down
3 changes: 2 additions & 1 deletion src/EventSubscriber/StrawberryEventSaveFlavorSubscriber.php
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ protected function trackFilesDeleted(EntityInterface $entity, FileInterface $fil
$query->addCondition('file_uuid', $file->uuid())
->addCondition('search_api_datasource', $datasource_id)
->addCondition('uuid', $entity->uuid());
$query->setOption('search_api_retrieved_field_values', ['id']);
$query->setOption('search_api_retrieved_field_values', ['id' => 'id']);
// Query breaks if not because standard hl is enabled for all fields.
// and normal hl offsets on OCR HL specific ones.
$query->setOption('ocr_highlight', 'on');
Expand All @@ -131,6 +131,7 @@ protected function trackFilesDeleted(EntityInterface $entity, FileInterface $fil
}
// If there are still more left, change the range and query again.
if (count($tracked_ids) < $max) {
$query = $query->getOriginalQuery();
$query->range($limit * $i, $limit);
$results = $query->execute();
$newcount = $results->getResultCount();
Expand Down
288 changes: 219 additions & 69 deletions src/Plugin/search_api/processor/StrawberryFieldHighlight.php

Large diffs are not rendered by default.

68 changes: 56 additions & 12 deletions src/Plugin/search_api/processor/StrawberryFlavorAggregate.php
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,11 @@ public function addFieldValues(ItemInterface $item) {
array_map('trim', $processor_ids)
);
foreach ($processor_ids as $processor_id) {

$flavors = $this->flavorsfromSolrIndex(
$node->id(), $processor_id, $indexes, $limit = 500
$node->id(), $processor_id, $indexes, 50, 500
);

$flavors = array_filter($flavors);
if (count($flavors)) {
$flavors = array_values($flavors);
Expand Down Expand Up @@ -201,12 +203,13 @@ public function addFieldValues(ItemInterface $item) {
* @param string $file_uuid
* @param array $indexes
* @param int $limit
*
* The number of SBF Documents to get per query
* @param int $max number of SBF to fetch at all.
* @return array[]
* @throws \Drupal\Component\Plugin\Exception\PluginException
* @throws \Drupal\search_api\SearchApiException
*/
protected function flavorsfromSolrIndex(int $nodeid, string $processor, array $indexes, $limit = 500) {
protected function flavorsfromSolrIndex(int $nodeid, string $processor, array $indexes, $limit = 50, $max = 500) {
$values = [];
/* @var \Drupal\search_api\IndexInterface[] $indexes */
foreach ($indexes as $search_api_index) {
Expand Down Expand Up @@ -261,28 +264,47 @@ protected function flavorsfromSolrIndex(int $nodeid, string $processor, array $i
'sequence_id'
);

$query->sort('search_api_relevance', 'DESC');
$sorted = FALSE;
// Override Sort if we have a sequence ID for this data source
foreach ($fields_with_sequence_id as $field_with_sequence_id) {
// \Drupal\search_api\Plugin\search_api\data_type\IntegerDataType
if ($field_with_sequence_id->getType() == 'string' || $field_with_sequence_id->getType() == 'integer') {
$query->sort($field_with_sequence_id->getFieldIdentifier(), 'DESC');
if ($field_with_sequence_id->getType() == 'integer'
) {
$query->sort($field_with_sequence_id->getFieldIdentifier(), 'ASC');
$sorted = TRUE;
break;
}
}
if (!$sorted) {
// Sorting by a string field is no different from sorting by the id itself.
$query->sort('search_api_id', 'ASC');
}

$field_with_plaintex = $this->getFieldsHelper()
$fields_with_plaintext = $this->getFieldsHelper()
->filterForPropertyPath(
$search_api_index->getFields(), 'strawberryfield_flavor_datasource',
'plaintext'
);
// Needed to avoid statically caching the results
// $query->getOriginalQuery() is not reliable and eventually
// gets polluted (marked as processed)
// Drupal why is your code so messy?
$query->setProcessingLevel(QueryInterface::PROCESSING_NONE);

$query->setProcessingLevel(QueryInterface::PROCESSING_BASIC);
try {
$fields = ['search_api_relevance','search_api_datasource','search_api_language','search_api_id'];
foreach ($fields_with_plaintext as $key => $field_data) {
$fields[] = $key;
}
$fields = array_combine($fields, $fields);
$query->setOption('search_api_retrieved_field_values', $fields);
$results = $query->execute();
}
catch (\Exception $exception) {
$this->logException($exception, '%type while trying to fetch Strawberry Flavors from Search API');
$this->logException(
$exception,
'%type while trying to fetch Strawberry Flavors from Search API'
);
return $values;
}
// remove the ID and the parent, not needed for file matching
Expand All @@ -292,9 +314,18 @@ protected function flavorsfromSolrIndex(int $nodeid, string $processor, array $i
]
];

if ($results->getResultCount() >= 1) {
$i = 0;
$j = 0;
$max_from_backend = $results->getResultCount();
$max_from_backend = $newcount = $max_from_backend > $max ? $max : $max_from_backend;

while ($j < $max_from_backend && $newcount > 0) {
$i++;
foreach ($results->getResultItems() as $resultItem) {
$property_values = $this->getFieldsHelper()->extractItemValues([$resultItem], $required_properties_by_datasource, false);
$j++;
$property_values = $this->getFieldsHelper()->extractItemValues(
[$resultItem], $required_properties_by_datasource, FALSE
);
foreach ($property_values as $plaintext) {
if (($plaintext['plaintext'][0] ?? NULL) instanceof TextValue) {
// Wonder if we can use __toString() here as a magic prop
Expand All @@ -308,12 +339,25 @@ protected function flavorsfromSolrIndex(int $nodeid, string $processor, array $i
$text_to_clean = str_replace("-\n ", "", $text_to_clean);
$text_to_clean = str_replace("\n ", " ", $text_to_clean);
$text_to_clean = str_replace("\n", " ", $text_to_clean);
$text_to_clean = preg_replace(['/\h{2,}|(\h*\v{1,})/umi', '/\v{2,}/uim', '/\h{2,}/uim'], [' ', ' ', ' '], $text_to_clean);
$text_to_clean = preg_replace(
['/\h{2,}|(\h*\v{1,})/umi', '/\v{2,}/uim', '/\h{2,}/uim'],
[' ', ' ', ' '], $text_to_clean
);
if (strlen(trim($text_to_clean)) > 0) {
$values[] = $text_to_clean;
}
}
}
if ($j < $max_from_backend && $j > 0) {
// Reusing the query cannot be done because it will return the original query results,
// which are statically cached.
// I could clone and clone but that would use extra memory,
// so I remove PROCESSING to avoid returning the same 50!
$query = $query->getOriginalQuery();
$query->range($limit * $i, $limit);
$results = $query->execute();
$newcount = $results->getResultCount();
}
}
}
return $values;
Expand Down
22 changes: 15 additions & 7 deletions src/Plugin/search_api/processor/StrawberryReduceReturn.php
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,9 @@ protected function setConfigFactory(ConfigFactoryInterface $config_factory) {
*/
public function preprocessSearchQuery(QueryInterface $query) {
// We really don't want to return the aggregated fields this processor
// Provides
// Unnecessary HUGE payload.
// Provides. Unnecessary HUGE payload.
if (isset($query->getOptions()['search_api_view']) && $query->getOptions()['search_api_view']->getDisplay()->usesFields()) {
//don't override any other options set by someone else.

if (empty($query->getOptions()['search_api_retrieved_field_values'] ?? [])) {
$fields = [];
// Get me all the fields of this index (gosh)
Expand All @@ -90,14 +88,24 @@ public function preprocessSearchQuery(QueryInterface $query) {
unset($fields[$key]);
}
$fields = array_values(array_keys($fields));
$fields += ['search_api_relevance','search_api_datasource','search_api_language','search_api_id'];
// See \Drupal\search_api_solr\Plugin\search_api\backend\SearchApiSolrBackend::getRequiredFields for a better list
// Sadly that method is protected so we will need to re-write it here?
$fields += ['search_api_relevance','search_api_datasource','search_api_language','search_api_id','site_hash'];
$query->setOption('search_api_retrieved_field_values', $fields);
$query->setOption('highlight_reduce_return', ['*']);
// don't override highlight fields if any other SBF module/processor has already done this
if ($query->getOption('sbf_highlight_fields', NULL)) {
$query->setOption('sbf_highlight_fields', ['*']);
}
}
}
elseif (isset($query->getOptions()['search_api_view']) && !$query->getOptions()['search_api_view']->getDisplay()->usesFields()) {
$query->setOption('search_api_retrieved_field_values', ['search_api_relevance','search_api_datasource','search_api_language','search_api_id']);
$query->setOption('highlight_reduce_return', ['*']);
// See \Drupal\search_api_solr\Plugin\search_api\backend\SearchApiSolrBackend::getRequiredFields for a better list
// Sadly that method is protected so we will need to re-write it here?
$query->setOption('search_api_retrieved_field_values', ['search_api_relevance','search_api_datasource','search_api_language','search_api_id','site_hash']);
// don't override highlight fields if any other SBF module/processor has already done this
if ($query->getOption('sbf_highlight_fields', NULL)) {
$query->setOption('sbf_highlight_fields', ['*']);
}
}
}
}
48 changes: 28 additions & 20 deletions strawberryfield.module
Original file line number Diff line number Diff line change
Expand Up @@ -369,30 +369,31 @@ function strawberryfield_search_api_solr_query_alter(SolariumQueryInterface $sol

$solarium_query->addParam('hl.ocr.fl', $solr_field_names['ocr_text']);
$solarium_query->addParam('hl.ocr.absoluteHighlights', 'on');
$solarium_query->addParam('hl.method', 'UnifiedHighlighter');
// Only place where unified is justified
$hl->setMethod('unified');
}
}
elseif ($query->getOption('highlight_reduce_return', FALSE)) {
elseif ($query->getOption('sbf_highlight_fields', FALSE)) {
//advanced_highlight_return
// ELSEIF bc OCR and these ones are incompatible
/* @var \Solarium\Component\Highlighting\Highlighting $hl */
$hl = $solarium_query->getHighlighting();
$highlight_fields = $query->getOption('highlight_reduce_return',[]);
if ($highlight_fields != ['*']) {
$highlight_fields = array_filter(
$highlight_fields, function ($v) {
return preg_match('/^t.?[sm]_/', $v) || preg_match('/^s[sm]_/', $v);
}
);
}
else {
$hl->setRequireFieldMatch(TRUE);
}

$highlight_fields = $query->getOption('sbf_highlight_fields',[]);
foreach ($highlight_fields as $highlighted_field) {
// We must not set the fields at once using setFields() to not break
// the altered queries.
$hl->addField($highlighted_field);
$hl->addField($highlighted_field);
}

// Force HL to original for now. We can make this an option
// but given the Drupal nature of treating all Full Text fields as the same
// If a given Full text does not contain the vector index data required this will
// fail. Unified does not play with JOINs on Solr 9.1 throwing
// a class mismatch even if we are not asking for Highlights from the flavor.
// @TODO revisit for Solr 9.2.x
$hl->setUsePhraseHighlighter(TRUE);
$hl->setMethod('original');
$hl->setFragSize(128);
$hl->setRequireFieldMatch(TRUE);
}
}
Expand All @@ -405,19 +406,26 @@ function strawberryfield_search_api_solr_converted_query_alter(SolariumQueryInte
ComponentAwareQueryInterface::COMPONENT_EDISMAX
);
$solarium_query->addParam('defType', 'lucene');
/* @var \Solarium\Component\Highlighting\Highlighting $hl */
$hl = $solarium_query->getHighlighting();

$hl->setUsePhraseHighlighter(TRUE);
$hl->setDefaultSummary(TRUE);
$hl->setMethod('original');
$hl->setRequireFieldMatch(TRUE);
//$hl->setMethod('unified'); This requires omitTermFreqAndPositions=FALSE;
// @TODO make this a setting? 128 feels like a decent number TBH.
$hl->setFragsizeIsMinimum(FALSE);
$hl->setMergeContiguous(TRUE);
$hl->setFragSize(128);
$hl->setDefaultSummary(TRUE);
if ($combined_keys = $query->getOption('sbf_join_flavor')['hl'] ?? NULL) {
$hl->setQuery($combined_keys);
}

// Because the Query Sets a few Fields to retrieve (to make it faster)
// But Search API is silly and decides that when that happens
// I want only those fields highlighted
// By setting the fields to all ('*') but limiting them via setRequireFieldMatch we only get the matched ones.
// This fails with JOINS and unified so we set method original.
$hl->setFields(['*']);
$hl->setFragsizeIsMinimum(FALSE);

}
}
}
Expand Down