
Commit

Merge remote-tracking branch 'origin/master' into issue-182
elvesmrodrigues committed Sep 20, 2022
2 parents ad20e60 + 4f9e43f commit b6a26e3
Showing 146 changed files with 25,920 additions and 841 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -34,7 +34,7 @@ There are two ways to instantiate the project: the automatic version or
> pip install --use-deprecated=legacy-resolver -r requirements.txt
4. Navigate to the search_engine/mpmg folder and make a copy of the file "settings.template.py" named "settings.py". Change directories and passwords if necessary.
- 5. Create a user to access the API interface. Navigate to the search_engine directory and run:
+ 5. Create a user to access the API interface. You will need to specify which type of client the user is via `api_client_name`: enter `procon` or `gsi`, according to your needs. Navigate to the search_engine directory and run:
> python manage.py createsuperuser
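
A hypothetical session might look like the following; the `api_client_name` prompt reflects the client-type field described above, while the remaining prompts and values are assumptions:
> python manage.py createsuperuser
> api_client_name: procon
> Username: admin
> Password: ********
> Superuser created successfully.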
## Execution
114 changes: 114 additions & 0 deletions feedback/candidates_generation.py
@@ -0,0 +1,114 @@
import json
import random
import re

import requests
from tqdm import tqdm

def format_text(text):
    # Collapse runs of newlines, then flatten the text onto a single line
    text = re.sub(r'\n+', '\n', text)
    return text.replace("\n", " ")

def apply_similarity(method):
    # Close all indices, switch the default similarity, then reopen them
    print(requests.post("http://localhost:9200/_all/_close?wait_for_active_shards=0&pretty").json())
    method = "BM25" if method in ("dense", "bm25sparse") else "LMDirichlet"
    data = json.dumps({"index": {"similarity": {"default": {"type": method}}}})
    print(requests.put("http://localhost:9200/_all/_settings?pretty", headers={"Content-Type": "application/json"}, data=data).json())
    print(requests.post("http://localhost:9200/_all/_open?pretty").json())

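# For reference, the three requests in apply_similarity are equivalent to this
# REST sequence (a sketch; similarity is a static index setting, so the
# indices must be closed before changing it and reopened afterwards):
#
#   POST http://localhost:9200/_all/_close?wait_for_active_shards=0
#   PUT  http://localhost:9200/_all/_settings
#        {"index": {"similarity": {"default": {"type": "BM25"}}}}
#   POST http://localhost:9200/_all/_open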

if __name__ == "__main__":
    # generated_queries.txt holds a Python-literal list of query dicts
    with open("./data/generated_queries.txt", "r") as queries_file:
        queries = eval(queries_file.read())
    random.shuffle(queries)

    ## Submit the queries under each retrieval strategy
    for strategy in ["dense", "bm25sparse", "dirisparse"]:
        apply_similarity(strategy)
        feedback_data = []
        for q in tqdm(queries):
            try:
                query = q["query"].replace("/", "")
                params = {'consulta': query, 'page': 1, 'sid': '123'}
                service_response = requests.get('http://localhost:8000/services/search', params)
                feedback_line = {}
                feedback_line["text"] = query
                feedback_line["_id"] = q["id"]
                feedback_line["corresponding"] = []
                # Order the returned documents by ranking position (the first
                # 2 per strategy are selected later, in the merge step)
                results = sorted(json.loads(service_response.text)["documentos"], key=lambda x: x["posicao_ranking"])
                if not len(results):
                    print("empty result list for query:", query)
                for row in results:
                    feedback_line["corresponding"].append({"text": row["conteudo"], "_id": row["id"]})
                feedback_data.append(feedback_line)
            except Exception as err:
                print(err)
                continue

        with open(f"./data/feedback_queries_{strategy}.json", "w") as output:
            output.write(json.dumps(feedback_data, ensure_ascii=False))

    ## Merge the rankings of the three strategies
    with open("./data/feedback_queries_dense.json", "r") as input_file:
        feedback_data_dense = json.load(input_file)
    with open("./data/feedback_queries_dirisparse.json", "r") as input_file:
        feedback_data_dirisparse = json.load(input_file)
    with open("./data/feedback_queries_bm25sparse.json", "r") as input_file:
        feedback_data_bm25sparse = json.load(input_file)

    # Index each strategy's entries by query text for direct lookup
    feedback_data_dense_dict = {line["text"]: line for line in feedback_data_dense}
    feedback_data_dirisparse_dict = {line["text"]: line for line in feedback_data_dirisparse}
    feedback_data_bm25sparse_dict = {line["text"]: line for line in feedback_data_bm25sparse}

    feedback_data_final = []
    for q in tqdm(queries):
        try:
            query = q["query"].replace("/", "")

            feedback_line = {}
            feedback_line["text"] = "CONSULTA: " + query
            feedback_line["query"] = query
            feedback_line["_id"] = q["id"]
            feedback_line["corresponding"] = []
            included = []

            # Count the candidates available across the three strategies and
            # skip queries that cannot fill the 6 slots
            max_candidate = 0
            for feedback_list in [feedback_data_dense_dict, feedback_data_dirisparse_dict, feedback_data_bm25sparse_dict]:
                max_candidate += len(feedback_list[query]["corresponding"])
            if max_candidate < 6:
                continue

            # Combine the corresponding documents of each strategy, taking up
            # to 2 new documents per strategy per pass, until 6 are collected
            while len(feedback_line["corresponding"]) < 6:
                counter = 0
                for origin, feedback_list in zip(["dense", "dirisparse", "bm25sparse"], [feedback_data_dense_dict, feedback_data_dirisparse_dict, feedback_data_bm25sparse_dict]):
                    for entry in feedback_list[query]["corresponding"]:
                        # TODO: include the element's position in its origin ranking
                        # Skip documents already included for this query
                        if query + "--" + entry["_id"] in included:
                            continue
                        if counter == 2:
                            counter = 0
                            break
                        entry["text"] = format_text(entry["text"])
                        entry["origin"] = origin
                        feedback_line["corresponding"].append(entry)
                        counter += 1
                        included.append(query + "--" + entry["_id"])
            feedback_data_final.append(feedback_line)
        except Exception as err:
            print(err)
            continue

    with open("./data/feedback_queries_final.json", "w") as output:
        output.write(json.dumps(feedback_data_final, ensure_ascii=False))
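
For reference, a minimal sketch of the input this script expects in ./data/generated_queries.txt, inferred from the keys read above ("query" and "id"); the values are invented:

[
    {"id": "q001", "query": "fraude em licitacao"},
    {"id": "q002", "query": "direito do consumidor"},
]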
65 changes: 65 additions & 0 deletions feedback/candidates_processing.py
@@ -0,0 +1,65 @@
import json

from nltk.corpus import stopwords

# Portuguese stopwords (requires the NLTK stopwords corpus to be downloaded)
stopwords = stopwords.words("portuguese")


if __name__ == "__main__":
    REVIEWERS_NUMBER = 4
    ENTRIES_PER_REVIEWER = 2

    # Distribute the entries among the reviewers and color-code the query terms
    with open("./data/feedback_queries_final.json", "r") as input_file:
        feedback_data_final = json.load(input_file)

    feedback = {f"reviewer_{idx}": [] for idx in range(REVIEWERS_NUMBER)}

    for idx, entry in enumerate(feedback_data_final):
        # Each entry goes to two reviewers (0 and 2, or 1 and 3); TODO: review this logic
        value = idx % ENTRIES_PER_REVIEWER
        feedback[f"reviewer_{value}"].append(entry)
        feedback[f"reviewer_{value + ENTRIES_PER_REVIEWER}"].append(entry)

    # Coloring: reload the generated queries
    with open("./data/generated_queries.txt", "r") as queries_file:
        queries = eval(queries_file.read())

    # One highlight style per (non-stopword) query token
    style_list = [
        'style="color:white;background-color:gray;"',
        'style="color:white;background-color:blue;"',
        'style="color:white;background-color:green;"',
        'style="color:black;background-color:yellow;"',
        'style="color:white;background-color:brown;"',
        'style="color:black;background-color:orange;"',
        'style="color:white;background-color:red;"',
        'style="color:black;background-color:pink;"',
        'style="color:white;background-color:purple;"',
    ]

    for person, data_list in feedback.items():
        for entry in data_list:
            query_tokens = entry["query"].split()
            query_tokens = [token for token in query_tokens if token not in stopwords]
            color_codes = dict(zip(query_tokens, style_list[:len(query_tokens)]))
            for corres in entry["corresponding"]:
                # Build the highlighted version of this candidate's text,
                # resetting the buffer for each candidate
                new_corresponding = ""
                for token in corres["text"].split():
                    if token in query_tokens:
                        new_corresponding += f"<mark {color_codes[token]}>" + token + "</mark> "
                    else:
                        new_corresponding += token + " "
                corres["formatted_text"] = new_corresponding

            # Highlight the query itself as well
            entry["formatted_text"] = "<br>"
            for token in entry["query"].split():
                if token in query_tokens:
                    entry["formatted_text"] += f"<mark {color_codes[token]}>" + token + "</mark> "
                else:
                    entry["formatted_text"] += token + " "

    # Save one file per reviewer
    for person, data_list in feedback.items():
        with open(f"./data/feedback_{person}.json", "w") as output:
            output.write(json.dumps(data_list, ensure_ascii=False))
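
For orientation, one entry in a per-reviewer file would look roughly like this; the structure mirrors what the two scripts build, and all values are invented:

{
    "text": "CONSULTA: consumidor",
    "query": "consumidor",
    "_id": "q001",
    "corresponding": [
        {"_id": "doc-17", "origin": "dense",
         "text": "defesa do consumidor em juizo",
         "formatted_text": "defesa do <mark style=\"color:white;background-color:gray;\">consumidor</mark> em juizo "}
    ],
    "formatted_text": "<br><mark style=\"color:white;background-color:gray;\">consumidor</mark> "
}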
