From ff7650ee99a3e4dadc807fc9b8dfc97ee87c2d54 Mon Sep 17 00:00:00 2001 From: HADDAD Zineddine Date: Tue, 5 Sep 2023 09:14:58 +0200 Subject: [PATCH] Feature: Add support of multilingual search (#40) * update get_term_search_query to support multilanguages search * rename var * fix search lang suffix to use underscore not @ * add multilangual search test --------- Co-authored-by: Syphax Bouazzouni --- Gemfile.lock | 2 +- helpers/search_helper.rb | 15 +++++--- test/controllers/test_search_controller.rb | 44 ++++++++++++++++++++++ test/data/ontology_files/BRO_v3.2.owl | 3 ++ test/solr/docker-compose.yml | 13 +++++++ test/solr/generate_ncbo_configsets.sh | 35 +++++++++-------- 6 files changed, 90 insertions(+), 22 deletions(-) create mode 100644 test/solr/docker-compose.yml diff --git a/Gemfile.lock b/Gemfile.lock index e4bda0b3..44537959 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -248,7 +248,7 @@ GEM rack (>= 0.4) rack-attack (6.6.1) rack (>= 1.0, < 3) - rack-cache (1.13.0) + rack-cache (1.14.0) rack (>= 0.4) rack-cors (1.0.6) rack (>= 1.6.0) diff --git a/helpers/search_helper.rb b/helpers/search_helper.rb index 5d37d884..071000d9 100644 --- a/helpers/search_helper.rb +++ b/helpers/search_helper.rb @@ -82,6 +82,9 @@ def get_term_search_query(text, params={}) end end + lang = params["lang"] || params["language"] + lang_suffix = lang && !lang.eql?("all") ? "_#{lang}" : "" + query = "" params["defType"] = "edismax" params["stopwords"] = "true" @@ -98,15 +101,15 @@ def get_term_search_query(text, params={}) if params[EXACT_MATCH_PARAM] == "true" query = "\"#{solr_escape(text)}\"" - params["qf"] = "resource_id^20 prefLabelExact^10 synonymExact #{QUERYLESS_FIELDS_STR}" - params["hl.fl"] = "resource_id prefLabelExact synonymExact #{QUERYLESS_FIELDS_STR}" + params["qf"] = "resource_id^20 prefLabelExact#{lang_suffix }^10 synonymExact#{lang_suffix } #{QUERYLESS_FIELDS_STR}" + params["hl.fl"] = "resource_id prefLabelExact#{lang_suffix } synonymExact#{lang_suffix } #{QUERYLESS_FIELDS_STR}" elsif params[SUGGEST_PARAM] == "true" || text[-1] == '*' text.gsub!(/\*+$/, '') query = "\"#{solr_escape(text)}\"" params["qt"] = "/suggest_ncbo" - params["qf"] = "prefLabelExact^100 prefLabelSuggestEdge^50 synonymSuggestEdge^10 prefLabelSuggestNgram synonymSuggestNgram resource_id #{QUERYLESS_FIELDS_STR}" + params["qf"] = "prefLabelExact#{lang_suffix }^100 prefLabelSuggestEdge^50 synonymSuggestEdge^10 prefLabelSuggestNgram synonymSuggestNgram resource_id #{QUERYLESS_FIELDS_STR}" params["pf"] = "prefLabelSuggest^50" - params["hl.fl"] = "prefLabelExact prefLabelSuggestEdge synonymSuggestEdge prefLabelSuggestNgram synonymSuggestNgram resource_id #{QUERYLESS_FIELDS_STR}" + params["hl.fl"] = "prefLabelExact#{lang_suffix } prefLabelSuggestEdge synonymSuggestEdge prefLabelSuggestNgram synonymSuggestNgram resource_id #{QUERYLESS_FIELDS_STR}" else if text.strip.empty? query = '*' @@ -114,9 +117,9 @@ def get_term_search_query(text, params={}) query = solr_escape(text) end - params["qf"] = "resource_id^100 prefLabelExact^90 prefLabel^70 synonymExact^50 synonym^10 #{QUERYLESS_FIELDS_STR}" + params["qf"] = "resource_id^100 prefLabelExact#{lang_suffix }^90 prefLabel#{lang_suffix }^70 synonymExact#{lang_suffix }^50 synonym#{lang_suffix }^10 #{QUERYLESS_FIELDS_STR}" params["qf"] << " property" if params[INCLUDE_PROPERTIES_PARAM] == "true" - params["hl.fl"] = "resource_id prefLabelExact prefLabel synonymExact synonym #{QUERYLESS_FIELDS_STR}" + params["hl.fl"] = "resource_id prefLabelExact#{lang_suffix } prefLabel#{lang_suffix } synonymExact#{lang_suffix } synonym#{lang_suffix } #{QUERYLESS_FIELDS_STR}" params["hl.fl"] = "#{params["hl.fl"]} property" if params[INCLUDE_PROPERTIES_PARAM] == "true" end diff --git a/test/controllers/test_search_controller.rb b/test/controllers/test_search_controller.rb index 21a3dd18..74be75d2 100644 --- a/test/controllers/test_search_controller.rb +++ b/test/controllers/test_search_controller.rb @@ -213,4 +213,48 @@ def test_search_provisional_class assert_equal @@test_pc_child.label, provisional[0]["prefLabel"].first end + def test_multilingual_search + get "/search?q=Activity&ontologies=BROSEARCHTEST-0" + res = MultiJson.load(last_response.body) + refute_equal 0, res["totalCount"] + + doc = res["collection"].select{|doc| doc["@id"].to_s.eql?('http://bioontology.org/ontologies/Activity.owl#Activity')}.first + refute_nil doc + + #res = LinkedData::Models::Class.search("prefLabel_none:Activity", {:fq => "submissionAcronym:BROSEARCHTEST-0", :start => 0, :rows => 80}, :main) + #refute_equal 0, res["response"]["numFound"] + #refute_nil res["response"]["docs"].select{|doc| doc["resource_id"].eql?('http://bioontology.org/ontologies/Activity.owl#Activity')}.first + + get "/search?q=Activit%C3%A9&ontologies=BROSEARCHTEST-0&lang=fr" + res = MultiJson.load(last_response.body) + refute_equal 0, res["totalCount"] + refute_nil res["collection"].select{|doc| doc["@id"].eql?('http://bioontology.org/ontologies/Activity.owl#Activity')}.first + + + + get "/search?q=ActivityEnglish&ontologies=BROSEARCHTEST-0&lang=en" + res = MultiJson.load(last_response.body) + refute_equal 0, res["totalCount"] + refute_nil res["collection"].select{|doc| doc["@id"].eql?('http://bioontology.org/ontologies/Activity.owl#Activity')}.first + + + get "/search?q=ActivityEnglish&ontologies=BROSEARCHTEST-0&lang=fr&require_exact_match=true" + res = MultiJson.load(last_response.body) + assert_nil res["collection"].select{|doc| doc["@id"].eql?('http://bioontology.org/ontologies/Activity.owl#Activity')}.first + + get "/search?q=ActivityEnglish&ontologies=BROSEARCHTEST-0&lang=en&require_exact_match=true" + res = MultiJson.load(last_response.body) + refute_nil res["collection"].select{|doc| doc["@id"].eql?('http://bioontology.org/ontologies/Activity.owl#Activity')}.first + + get "/search?q=Activity&ontologies=BROSEARCHTEST-0&lang=en&require_exact_match=true" + res = MultiJson.load(last_response.body) + assert_nil res["collection"].select{|doc| doc["@id"].eql?('http://bioontology.org/ontologies/Activity.owl#Activity')}.first + + get "/search?q=Activit%C3%A9&ontologies=BROSEARCHTEST-0&lang=fr&require_exact_match=true" + res = MultiJson.load(last_response.body) + refute_nil res["collection"].select{|doc| doc["@id"].eql?('http://bioontology.org/ontologies/Activity.owl#Activity')}.first + + + end + end diff --git a/test/data/ontology_files/BRO_v3.2.owl b/test/data/ontology_files/BRO_v3.2.owl index d64075cc..b2aeccf5 100644 --- a/test/data/ontology_files/BRO_v3.2.owl +++ b/test/data/ontology_files/BRO_v3.2.owl @@ -631,6 +631,9 @@ Activity + Activity + ActivityEnglish + Activité Activity of interest that may be related to a BRO:Resource. activities diff --git a/test/solr/docker-compose.yml b/test/solr/docker-compose.yml new file mode 100644 index 00000000..3ddae69c --- /dev/null +++ b/test/solr/docker-compose.yml @@ -0,0 +1,13 @@ +version: '3.8' + +services: + op_solr: + image: solr:8.8 + volumes: + - ./solr_configsets:/configsets:ro + ports: + - "8983:8983" + command: > + bash -c "precreate-core term_search_core1 /configsets/term_search + && precreate-core prop_search_core1 /configsets/property_search + && solr-foreground" diff --git a/test/solr/generate_ncbo_configsets.sh b/test/solr/generate_ncbo_configsets.sh index 893f7f3a..7b4281f7 100755 --- a/test/solr/generate_ncbo_configsets.sh +++ b/test/solr/generate_ncbo_configsets.sh @@ -2,18 +2,23 @@ # generates solr configsets by merging _default configset with config files in config/solr # _default is copied from sorl distribuion solr-8.10.1/server/solr/configsets/_default/ -pushd solr/configsets -ld_config='../../../../ontologies_linked_data/config/solr/' -#ld_config='../../../../config/solr/' -ls -l $ld_config -pwd -[ -d property_search ] && rm -Rf property_search -[ -d term_search ] && rm -Rf property_search -[ -d $ld_config/property_search ] || echo "cant find ontologies_linked_data project" -mkdir -p property_search/conf -mkdir -p term_search/conf -cp -a _default/conf/* property_search/conf/ -cp -a _default/conf/* term_search/conf/ -cp -a $ld_config/property_search/* property_search/conf -cp -a $ld_config/term_search/* term_search/conf -popd +#cd solr/configsets +ld_config='config/solr' +configsets='test/solr/configsets' +[ -d ${configsets}/property_search ] && rm -Rf ${configsets}/property_search +[ -d ${configsets}/term_search ] && rm -Rf ${configsets}/term_search +if [[ ! -d ${ld_config}/property_search ]]; then + echo 'cant find ld solr config sets' + exit 1 +fi +if [[ ! -d ${configsets}/_default/conf ]]; then + echo 'cant find default solr configset' + exit 1 +fi +mkdir -p ${configsets}/property_search/conf +mkdir -p ${configsets}/term_search/conf +cp -a ${configsets}/_default/conf/* ${configsets}/property_search/conf/ +cp -a ${configsets}/_default/conf/* ${configsets}/term_search/conf/ +cp -a $ld_config/property_search/* ${configsets}/property_search/conf +cp -a $ld_config/term_search/* ${configsets}/term_search/conf +