moved functions to somewhere else (#78)
* moved functions to somewhere else

* oops

* black and lint

---------

Co-authored-by: Lars van de Kerkhof <[email protected]>
viggo-devries and specialunderwear authored Dec 4, 2024
1 parent 4b46d83 commit 44a99c2
Showing 3 changed files with 122 additions and 113 deletions.
8 changes: 6 additions & 2 deletions oscar_elasticsearch/search/indexing/indexer.py
@@ -52,7 +52,9 @@ def bulk_index(self, documents, current_alias=None):
         bulk(es, docs, ignore=[400])
 
     def get_current_alias(self):
-        aliasses = list(es.indices.get_alias(name=self.name).keys())
+        aliasses = list(
+            es.indices.get_alias(name=self.name, ignore_unavailable=True).keys()
+        )
         if aliasses:
             return aliasses[0]
 
@@ -64,7 +66,9 @@ def finish(self):
         # Check if an alias exists for the index
         if es.indices.exists_alias(name=self.name):
             # Get aliases
-            aliased_indices = es.indices.get_alias(name=self.name).keys()
+            aliased_indices = es.indices.get_alias(
+                name=self.name, ignore_unavailable=True
+            ).keys()
 
             # Link the new alias to the old index
             es.indices.put_alias(name=self.name, index=self.alias_name)
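The new `ignore_unavailable=True` flag makes the alias lookup skip missing or closed backing indices instead of raising an error, which matters mid-reindex when an old index may already have been deleted. Below is a minimal sketch of the patched lookup in isolation, assuming an elasticsearch-py 8.x client against a local cluster; the standalone function name and the try/except around a completely missing alias are illustrative, not part of this commit:

from elasticsearch import Elasticsearch, NotFoundError

es = Elasticsearch("http://localhost:9200")  # assumed local cluster

def current_alias(name):
    try:
        # ignore_unavailable=True skips missing/closed backing indices
        # rather than erroring out, mirroring the change above
        aliasses = list(es.indices.get_alias(name=name, ignore_unavailable=True).keys())
    except NotFoundError:
        return None  # no index carries this alias at all
    return aliasses[0] if aliasses else None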
113 changes: 2 additions & 111 deletions oscar_elasticsearch/search/indexing/settings.py
@@ -1,3 +1,4 @@
+from oscar_elasticsearch.search.utils import get_index_settings
 from oscar_elasticsearch.search.settings import (
     INDEX_PREFIX,
     FACETS,
@@ -8,117 +9,7 @@
 
 
 def get_oscar_index_settings():
-    return {
-        "analysis": {
-            "analyzer": {
-                # the simplest analyzer, most useful for normalizing and splitting a sentence into words;
-                # this is most likely only used as a search analyzer
-                "lowercasewhitespace": {
-                    "tokenizer": "whitespace",
-                    "filter": ["lowercase", "asciifolding"],
-                    "char_filter": ["non_ascii_character_filter_mapping"],
-                },
-                # this analyzer will keep all punctuation and numbers and make ngrams
-                # as small as a single character. Only useful for UPCs and technical terms
-                "technical_analyzer": {
-                    "tokenizer": "whitespace",
-                    "filter": [
-                        "shallow_edgengram",
-                        "lowercase",
-                        "asciifolding",
-                        "max_gram_truncate",
-                    ],
-                    "char_filter": ["non_ascii_character_filter_mapping"],
-                },
-                # should be used as the search analyzer for terms analyzed with the
-                # technical_analyzer. Will just split the input into words and normalize,
-                # keeping in mind the max ngram size.
-                "technical_search_analyzer": {
-                    "tokenizer": "whitespace",
-                    "filter": [
-                        "lowercase",
-                        "asciifolding",
-                        "max_gram_truncate",
-                    ],
-                    "char_filter": ["non_ascii_character_filter_mapping"],
-                },
-                # this analyzer is useful for important textual data, like titles,
-                # that contain a lot of search terms.
-                "title_analyzer": {
-                    "tokenizer": "standard",
-                    "filter": [
-                        "edgengram",
-                        "lowercase",
-                        "asciifolding",
-                        "max_gram_truncate",
-                    ],
-                },
-                # should be used as the search analyzer for terms analyzed with title_analyzer
-                "reversed_title_analyzer": {
-                    "tokenizer": "standard",
-                    "filter": [
-                        "lowercase",
-                        "asciifolding",
-                        "reversed_edgengram",
-                        "max_gram_truncate",
-                    ],
-                },
-                # this analyzer is most useful for long textual data. Punctuation and numbers
-                # WILL BE STRIPPED
-                "standard": {
-                    "tokenizer": "standard",
-                    "filter": ["lowercase", "asciifolding"],
-                },
-                # This analyzer is useful when you need to find very specific data inside some text;
-                # for example, a 'Volvo Penta TAD163532E' code inside your model type that should be found with 'Penta D16'.
-                # Also use the 'technical_search_analyzer' as the search analyzer for this one.
-                "technical_title_analyzer": {
-                    "tokenizer": "whitespace",
-                    "filter": [
-                        "ngram",
-                        "lowercase",
-                        "asciifolding",
-                        "max_gram_truncate",
-                    ],
-                },
-            },
-            "tokenizer": {
-                "ngram_tokenizer": {"type": "ngram", "min_gram": 3, "max_gram": 15},
-                "edgengram_tokenizer": {
-                    "type": "edge_ngram",
-                    "min_gram": 2,
-                    "max_gram": MAX_GRAM,
-                },
-            },
-            "filter": {
-                "ngram": {"type": "ngram", "min_gram": 3, "max_gram": MAX_GRAM},
-                "edgengram": {
-                    "type": "edge_ngram",
-                    "min_gram": 2,
-                    "max_gram": MAX_GRAM,
-                },
-                "shallow_edgengram": {
-                    "type": "edge_ngram",
-                    "min_gram": 1,
-                    "max_gram": MAX_GRAM,
-                },
-                "reversed_edgengram": {
-                    "type": "edge_ngram",
-                    "min_gram": 3,
-                    "max_gram": MAX_GRAM,
-                    "side": "back",
-                },
-                "max_gram_truncate": {"type": "truncate", "length": MAX_GRAM},
-            },
-            "char_filter": {
-                "non_ascii_character_filter_mapping": {
-                    "type": "mapping",
-                    "mappings": ["’ => '"],
-                }
-            },
-        },
-        "index": {"number_of_shards": 1, "max_ngram_diff": MAX_GRAM},
-    }
+    return get_index_settings(MAX_GRAM)
 
 
 OSCAR_INDEX_MAPPING = {
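With the body moved out, `get_oscar_index_settings` is now a thin wrapper, so existing callers are unaffected. A quick sanity check of the delegation, assuming `MAX_GRAM` is among the names imported from `oscar_elasticsearch.search.settings` (the truncated import block above suggests it is):

from oscar_elasticsearch.search.indexing.settings import get_oscar_index_settings
from oscar_elasticsearch.search.settings import MAX_GRAM
from oscar_elasticsearch.search.utils import get_index_settings

# the wrapper and the moved function should return identical dicts,
# with the same two top-level keys as before the move
assert get_oscar_index_settings() == get_index_settings(MAX_GRAM)
assert set(get_oscar_index_settings()) == {"analysis", "index"}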
114 changes: 114 additions & 0 deletions oscar_elasticsearch/search/utils.py
@@ -25,3 +25,117 @@ def search_result_to_queryset(search_results, Model):
 
     preserved = Case(*[When(pk=pk, then=pos) for pos, pk in enumerate(instance_ids)])
     return Model.objects.filter(pk__in=instance_ids).order_by(preserved)
+
+
+def get_index_settings(MAX_GRAM):
+    return {
+        "analysis": {
+            "analyzer": {
+                # the simplest analyzer, most useful for normalizing and splitting a sentence into words;
+                # this is most likely only used as a search analyzer
+                "lowercasewhitespace": {
+                    "tokenizer": "whitespace",
+                    "filter": ["lowercase", "asciifolding"],
+                    "char_filter": ["non_ascii_character_filter_mapping"],
+                },
+                # this analyzer will keep all punctuation and numbers and make ngrams
+                # as small as a single character. Only useful for UPCs and technical terms
+                "technical_analyzer": {
+                    "tokenizer": "whitespace",
+                    "filter": [
+                        "shallow_edgengram",
+                        "lowercase",
+                        "asciifolding",
+                        "max_gram_truncate",
+                    ],
+                    "char_filter": ["non_ascii_character_filter_mapping"],
+                },
+                # should be used as the search analyzer for terms analyzed with the
+                # technical_analyzer. Will just split the input into words and normalize,
+                # keeping in mind the max ngram size.
+                "technical_search_analyzer": {
+                    "tokenizer": "whitespace",
+                    "filter": [
+                        "lowercase",
+                        "asciifolding",
+                        "max_gram_truncate",
+                    ],
+                    "char_filter": ["non_ascii_character_filter_mapping"],
+                },
+                # this analyzer is useful for important textual data, like titles,
+                # that contain a lot of search terms.
+                "title_analyzer": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "edgengram",
+                        "lowercase",
+                        "asciifolding",
+                        "max_gram_truncate",
+                    ],
+                },
+                # should be used as the search analyzer for terms analyzed with title_analyzer
+                "reversed_title_analyzer": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "lowercase",
+                        "asciifolding",
+                        "reversed_edgengram",
+                        "max_gram_truncate",
+                    ],
+                },
+                # this analyzer is most useful for long textual data. Punctuation and numbers
+                # WILL BE STRIPPED
+                "standard": {
+                    "tokenizer": "standard",
+                    "filter": ["lowercase", "asciifolding"],
+                },
+                # This analyzer is useful when you need to find very specific data inside some text;
+                # for example, a 'Volvo Penta TAD163532E' code inside your model type that should be found with 'Penta D16'.
+                # Also use the 'technical_search_analyzer' as the search analyzer for this one.
+                "technical_title_analyzer": {
+                    "tokenizer": "whitespace",
+                    "filter": [
+                        "ngram",
+                        "lowercase",
+                        "asciifolding",
+                        "max_gram_truncate",
+                    ],
+                },
+            },
+            "tokenizer": {
+                "ngram_tokenizer": {"type": "ngram", "min_gram": 3, "max_gram": 15},
+                "edgengram_tokenizer": {
+                    "type": "edge_ngram",
+                    "min_gram": 2,
+                    "max_gram": MAX_GRAM,
+                },
+            },
+            "filter": {
+                "ngram": {"type": "ngram", "min_gram": 3, "max_gram": MAX_GRAM},
+                "edgengram": {
+                    "type": "edge_ngram",
+                    "min_gram": 2,
+                    "max_gram": MAX_GRAM,
+                },
+                "shallow_edgengram": {
+                    "type": "edge_ngram",
+                    "min_gram": 1,
+                    "max_gram": MAX_GRAM,
+                },
+                "reversed_edgengram": {
+                    "type": "edge_ngram",
+                    "min_gram": 3,
+                    "max_gram": MAX_GRAM,
+                    "side": "back",
+                },
+                "max_gram_truncate": {"type": "truncate", "length": MAX_GRAM},
+            },
+            "char_filter": {
+                "non_ascii_character_filter_mapping": {
+                    "type": "mapping",
+                    "mappings": ["’ => '"],
+                }
+            },
+        },
+        "index": {"number_of_shards": 1, "max_ngram_diff": MAX_GRAM},
+    }
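
The returned dict plugs straight into index creation. A hedged sketch of wiring it up, assuming elasticsearch-py 8.x; the index name, the MAX_GRAM value of 15, and the single-field mapping are illustrative only:

from elasticsearch import Elasticsearch

from oscar_elasticsearch.search.utils import get_index_settings

es = Elasticsearch("http://localhost:9200")  # assumed local cluster

es.indices.create(
    index="catalogue-products-new",  # hypothetical index name
    settings=get_index_settings(15),  # 15 stands in for MAX_GRAM
    mappings={
        "properties": {
            # title_analyzer is defined by the "analysis" block above
            "title": {"type": "text", "analyzer": "title_analyzer"},
        }
    },
)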
