Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Elasticsearch search indice update #1796

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ jobs:
run: |
mkdir /tmp/elasticsearch
wget -O - https://download.elasticsearch.org/elasticsearch/release/org/elasticsearch/distribution/tar/elasticsearch/2.4.6/elasticsearch-2.4.6.tar.gz | tar xz --directory=/tmp/elasticsearch --strip-components=1
/tmp/elasticsearch/bin/plugin install analysis-icu
/tmp/elasticsearch/bin/elasticsearch --daemonize --path.data /tmp
sleep 30 # ElasticSearch takes few seconds to start, make sure it is available when the build script runs
- run: curl -v http://localhost:9200
Expand Down
52 changes: 36 additions & 16 deletions c2corg_api/search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,26 +83,46 @@ def create_search(document_type):

def get_text_query_on_title(search_term, search_lang=None):
    """Build the multi-match query used to search document titles.

    A term without any space character is treated as a single word and is
    matched against the ``ngram`` sub-fields with a ``best_fields`` query.
    A term containing at least one space is matched as a phrase against
    the ``contentheavy`` sub-fields.

    :param search_term: the text to search for.
    :param search_lang: optional language; when given, the title fields
        are listed explicitly for every language in ``default_langs``
        instead of using the ``title_*`` wildcard.
    :return: a ``MultiMatch`` query.
    """
    # Only a literal space marks a multi-word search; this is the same
    # test as ``search_term.count(' ') == 0`` with the sense inverted.
    is_phrase = ' ' in search_term
    subfield = 'contentheavy' if is_phrase else 'ngram'

    if not search_lang:
        # search in all title* (title_en, title_fr, ...) fields.
        fields = ['title_*.{0}'.format(subfield)]
    else:
        # NOTE(review): the requested language is not boosted; every
        # language in default_langs gets the same weight - confirm that
        # this is intended.
        fields = [
            'title_{0}.{1}'.format(lang, subfield)
            for lang in default_langs]

    if is_phrase:
        # NOTE(review): the Elasticsearch multi_match documentation lists
        # fuzziness as unsupported for phrase queries - confirm that it
        # has any effect here.
        query_type, fuzziness, max_expansions = 'phrase', 2, 3
    else:
        query_type, fuzziness, max_expansions = 'best_fields', 1, 2

    return MultiMatch(
        query=search_term,
        fields=fields,
        type=query_type,
        fuzziness=fuzziness,
        max_expansions=max_expansions,
        zero_terms_query="none",
        slop=4,
    )


search_documents = {
Expand Down
181 changes: 161 additions & 20 deletions c2corg_api/search/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,29 @@ class BaseMeta:
# the configuration is based on the one by Photon:
# https://github.com/komoot/photon/blob/master/es/mappings.json
# https://github.com/komoot/photon/blob/master/es/index_settings.json
def default_title_field():
return String(
index='not_analyzed',
similarity='c2corgsimilarity',
fields={
'ngram': String(
analyzer='index_ngram', search_analyzer='search_ngram'),
'raw': String(
analyzer='index_raw', search_analyzer='search_raw')})
def default_title_field(lang=None):
    """Build the mapping of a ``title_*`` field.

    The field itself is not analyzed; searching happens on its sub-fields
    ``ngram`` and ``raw``. When a language is given, a third sub-field
    ``contentheavy`` is added which uses the ``<lang>_heavy`` analyzer.

    :param lang: name prefix of the ``*_heavy`` analyzer (e.g.
        ``"french"``), or ``None`` to create only the ``ngram`` and
        ``raw`` sub-fields.
    :return: the ``String`` field definition.
    """
    # `lang` needs a real default value: the previous `lang: None` was only
    # an annotation, so calling the function without argument raised a
    # TypeError.
    fields = {
        'ngram': String(
            analyzer='index_ngram', search_analyzer='search_ngram'),
        'raw': String(
            analyzer='index_raw', search_analyzer='search_raw'),
    }
    if lang is not None:
        fields['contentheavy'] = String(
            analyzer='{0}_heavy'.format(lang))

    return String(
        index='not_analyzed',
        similarity='c2corgsimilarity',
        fields=fields)


class SearchDocument(DocType):
Expand Down Expand Up @@ -59,54 +73,68 @@ class Meta(BaseMeta):
areas = QLong('a', is_id=True)

# fr
title_fr = default_title_field()
title_fr = default_title_field("french")
summary_fr = String(
analyzer='index_french', search_analyzer='search_french')
description_fr = String(
analyzer='index_french', search_analyzer='search_french')

# it
title_it = default_title_field()
title_it = default_title_field("italian")
summary_it = String(
analyzer='index_italian', search_analyzer='search_italian')
description_it = String(
analyzer='index_italian', search_analyzer='search_italian')

# de
title_de = default_title_field()
title_de = default_title_field("german")
summary_de = String(
analyzer='index_german', search_analyzer='search_german')
description_de = String(
analyzer='index_german', search_analyzer='search_german')

# en
title_en = default_title_field()
title_en = default_title_field("english")
summary_en = String(
analyzer='index_english', search_analyzer='search_english')
description_en = String(
analyzer='index_english', search_analyzer='search_english')

# es
title_es = default_title_field()
title_es = default_title_field("spanish")
summary_es = String(
analyzer='index_spanish', search_analyzer='search_spanish')
description_es = String(
analyzer='index_spanish', search_analyzer='search_spanish')

# ca
title_ca = default_title_field()
title_ca = default_title_field("catalan")
summary_ca = String(
analyzer='index_catalan', search_analyzer='search_catalan')
description_ca = String(
analyzer='index_catalan', search_analyzer='search_catalan')

# eu
title_eu = default_title_field()
title_eu = default_title_field("basque")
summary_eu = String(
analyzer='index_basque', search_analyzer='search_basque')
description_eu = String(
analyzer='index_basque', search_analyzer='search_basque')

# sl
title_sl = default_title_field("slovene")
summary_sl = String(
analyzer='index_slovene', search_analyzer='search_slovene')
description_sl = String(
analyzer='index_slovene', search_analyzer='search_slovene')

# zh
title_zh = default_title_field("chinois")
summary_zh = String(
analyzer='index_chinois', search_analyzer='search_chinois')
description_zh = String(
analyzer='index_chinois', search_analyzer='search_chinois')

@staticmethod
def to_search_document(document, index, include_areas=True):
search_document = {
Expand Down Expand Up @@ -190,8 +218,12 @@ def copy_enum_range_fields(
"filter": {
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 20
"min_gram": "2",
"max_gram": "15",
"token_chars": [
"letter",
"digit"
]
},
# filters for the english analyzers
"english_stop": {
Expand Down Expand Up @@ -300,9 +332,9 @@ def copy_enum_range_fields(
"index_ngram": {
"char_filter": ["punctuationgreedy"],
"filter": [
"word_delimiter", "lowercase", "asciifolding", "unique",
"word_delimiter", "lowercase", "icu_folding",
"autocomplete_filter"],
"tokenizer": "standard"
"tokenizer": "icu_tokenizer"
},
"search_ngram": {
"char_filter": ["punctuationgreedy"],
Expand Down Expand Up @@ -453,6 +485,115 @@ def copy_enum_range_fields(
"basque_stop",
"basque_stemmer"
]
},
"french_heavy": {
"tokenizer": "icu_tokenizer",
"filter": [
"french_elision",
"french_stop",
"icu_folding",
"lowercase",
"french_stemmer"
]
},
"german_heavy": {
"tokenizer": "icu_tokenizer",
"filter": [
"german_stop",
"german_stemmer",
"lowercase",
"icu_folding"
]
},
"english_heavy": {
"tokenizer": "icu_tokenizer",
"filter": [
"english_possessive_stemmer",
"english_stop",
"lowercase",
"icu_folding"
]
},
"italian_heavy": {
"tokenizer": "icu_tokenizer",
"filter": [
"italian_elision",
"italian_stop",
"lowercase",
"icu_folding",
"italian_stemmer"
]
},
"spanish_heavy": {
"tokenizer": "icu_tokenizer",
"filter": [
"lowercase",
"spanish_stop",
"spanish_stemmer",
"icu_folding"
]
},
"catalan_heavy": {
"tokenizer": "icu_tokenizer",
"filter": [
"catalan_elision",
"lowercase",
"catalan_stop",
"catalan_stemmer",
"icu_folding"
]
},
"basque_heavy": {
"tokenizer": "icu_tokenizer",
"filter": [
"lowercase",
"basque_stop",
"basque_stemmer",
"icu_folding"
]
},
"index_slovene": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"autocomplete_filter",
"lowercase"
]
},
"search_slovene": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase"
]
},
"slovene_heavy": {
"tokenizer": "icu_tokenizer",
"filter": [
"lowercase",
"icu_folding"
]
},
"index_chinois": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"autocomplete_filter"
]
},
"search_chinois": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase"
]
},
"chinois_heavy": {
"tokenizer": "icu_tokenizer",
"filter": [
"lowercase",
"icu_folding"
]
}
}
}
34 changes: 34 additions & 0 deletions scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# ElasticSearch 2.4 update for search improvement

The script has to be executed once so that Elasticsearch is consistent with the code update.

## The script has to be executed from the host

### Script requirements
- curl must be available on the machine that runs the script

- it updates Elasticsearch on localhost:9200; if Elasticsearch runs on another server, edit the script and replace localhost with the correct ES URL.
- an Elasticsearch plugin has to be installed; in an Elasticsearch cluster, the plugin has to be installed on each node


To manually install the analysis-icu plugin on the Elasticsearch server:

```
./bin/plugin install analysis-icu
```


The installation process is:
- install the plugin,
- restart the Elasticsearch instance,
- update the analyzers (_settings),
- update the document mapping (_mappings /espon),
- start a full reindexing.

To launch the script from the host:
```
cd scripts/
./esUpdateMapSettings.sh
```


Loading