Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: added evaluation script #14

Open
wants to merge 41 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
77c1586
add gerdalir dataset
guenthermi Oct 31, 2023
bd5e945
fix: gerdalir dataset
guenthermi Oct 31, 2023
3eb5c71
fix: lang from en to de
guenthermi Oct 31, 2023
2d00b58
feat: add wikiclir en-de task
violenil Nov 1, 2023
830cd60
chore: add ir datasets to requirements
violenil Nov 1, 2023
6730483
refactor: limit queries to 10k
violenil Nov 1, 2023
11866e6
refactor: update description of task with limit
violenil Nov 2, 2023
cd955ec
Merge pull request #2 from jina-ai/feat-wikiclir-de
violenil Nov 2, 2023
151668c
solve merge conflict
guenthermi Nov 3, 2023
2b9c20f
revert style changes
guenthermi Nov 3, 2023
262ab27
Merge pull request #1 from jina-ai/add-gerdalir-dataset
guenthermi Nov 3, 2023
1a9278f
add XMarket dataset
guenthermi Nov 3, 2023
1f0c797
add xmarket to init file
guenthermi Nov 3, 2023
5818581
feat: add german stsbenchmarksts task
violenil Nov 6, 2023
dacf64a
feat: add revision id
guenthermi Nov 6, 2023
5049d05
feat: update revision id
violenil Nov 6, 2023
14448de
refactor: update revision id after changes in scores
violenil Nov 6, 2023
7e05fc0
Merge pull request #4 from jina-ai/feat-add-sts-de
violenil Nov 6, 2023
628fbb1
add paws x dataset
guenthermi Nov 6, 2023
b360db4
Merge pull request #3 from jina-ai/add-xmarket-de
guenthermi Nov 7, 2023
cbd8d95
add GermanDPR dataset
guenthermi Nov 7, 2023
7316147
fix loading
guenthermi Nov 7, 2023
b8e5afc
solve merge conflict
guenthermi Nov 7, 2023
30a9611
Update mteb/tasks/Retrieval/GermanDPRRetrieval.py
guenthermi Nov 7, 2023
3de1bb1
Merge pull request #5 from jina-ai/add-paws-x-dataset
guenthermi Nov 9, 2023
9e2ac3e
feat: add miracl reranking task for german
violenil Nov 9, 2023
60a61ac
refactor: cleanup task
violenil Nov 9, 2023
55155d6
Add ir_datasets as dependency
Markus28 Nov 9, 2023
969e202
Merge pull request #8 from jina-ai/add_ir_dependency
Markus28 Nov 9, 2023
c6733bb
Fix: Adding MTEB_SINGLE_GPU environment variable
Markus28 Nov 9, 2023
a6c9cb7
Merge pull request #9 from jina-ai/fix_multi_gpu
Markus28 Nov 9, 2023
cf2c380
fix: Use MTEB_SINGLE_GPU environment variable also in BeIRTask.py (#10)
Markus28 Nov 10, 2023
9aea329
Merge pull request #6 from jina-ai/add-german-dpr
guenthermi Nov 13, 2023
19f77dd
Merge pull request #7 from jina-ai/feat-add-miracl-reranking
violenil Nov 13, 2023
2822839
prevent duplicate pos docs
guenthermi Nov 13, 2023
76b7b45
fix: use test split in MIRACL (#13)
Markus28 Jan 11, 2024
0a4a7e5
feat: added evaluation script
Markus28 Jan 15, 2024
545746d
feat: renamed script, updated README
Markus28 Jan 15, 2024
5b9f053
feat: fixed phrasing in README
Markus28 Jan 15, 2024
3b9ff9d
feat: fixed phrasing in README
Markus28 Jan 15, 2024
64f03b5
feat: added back WikiCLIR
Markus28 Jan 15, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
308 changes: 17 additions & 291 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion mteb/abstasks/AbsTaskRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def evaluate(
corpus, queries, relevant_docs = self.corpus[split], self.queries[split], self.relevant_docs[split]
model = model if self.is_dres_compatible(model) else DRESModel(model)

if os.getenv("RANK", None) is None:
if os.getenv("RANK", None) is None or os.getenv("MTEB_SINGLE_GPU", "false").lower() == "true":
# Non-distributed
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
model = DRES(
Expand Down
2 changes: 1 addition & 1 deletion mteb/abstasks/BeIRTask.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def load_data(self, eval_splits=None, **kwargs):
USE_HF_DATASETS = False

# TODO @nouamane: move non-distributed to `HFDataLoader`
if os.getenv("RANK", None) is not None:
if os.getenv("RANK", None) is not None and os.getenv("MTEB_SINGLE_GPU", "false").lower() == "false":
if self.description["beir_name"].startswith("cqadupstack"):
raise ImportError("CQADupstack is incompatible with BEIR's HFDataLoader in a distributed setting")
from beir.datasets.data_loader_hf import HFDataLoader
Expand Down
51 changes: 51 additions & 0 deletions mteb/tasks/PairClassification/PawsX.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import datasets

from ...abstasks.AbsTaskPairClassification import AbsTaskPairClassification


class PawsX(AbsTaskPairClassification):
    @property
    def description(self):
        """Task metadata for the German PAWS-X pair-classification task."""
        return {
            "name": "PawsX",
            "hf_hub_name": "paws-x",
            "description": "",
            "reference": "",
            "category": "s2s",
            "type": "PairClassification",
            "eval_splits": ["test"],
            "eval_langs": ["de"],
            "main_score": "ap",
            "revision": "8a04d940a42cd40658986fdd8e3da561533a3646",
        }

    def load_data(self, **kwargs):
        """Load the PAWS-X test split and convert it into the
        pair-classification format (a single dict of parallel lists)."""
        if self.data_loaded:
            return

        raw = datasets.load_dataset(
            self.description["hf_hub_name"],
            self.description.get('eval_langs', ['en'])[0],
            revision=self.description.get("revision", None),
        )

        # Gather the three parallel columns from the test split.
        examples = raw['test']
        first_sentences = [example['sentence1'] for example in examples]
        second_sentences = [example['sentence2'] for example in examples]
        gold_labels = [example['label'] for example in examples]

        self.dataset = {
            'test': [
                {
                    'sent1': first_sentences,
                    'sent2': second_sentences,
                    'labels': gold_labels,
                }
            ]
        }

        self.data_loaded = True
1 change: 1 addition & 0 deletions mteb/tasks/PairClassification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .TwitterURLCorpusPC import *
from .CMTEBPairClassification import *
from .PolishPC import *
from .PawsX import *
22 changes: 22 additions & 0 deletions mteb/tasks/Reranking/MIRACLReranking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from mteb.abstasks.AbsTaskReranking import AbsTaskReranking


class MIRACLReranking(AbsTaskReranking):
    @property
    def description(self):
        """Task metadata for the German subset of the MIRACL reranking task.

        NOTE(review): the prose below mentions the dev set while
        ``eval_splits`` is ``['test']`` — a later commit in this PR switched
        evaluation to the ``test`` split of ``jinaai/miracl``; confirm the
        query count / split wording against that dataset.
        """
        return {
            'name': 'MIRACL',
            'hf_hub_name': 'jinaai/miracl',
            'reference': 'https://project-miracl.github.io/',
            'description': (
                'MIRACL (Multilingual Information Retrieval Across a Continuum of Languages) is a multilingual '
                'retrieval dataset that focuses on search across 18 different languages. This task focuses on '
                # typo fix: "uing" -> "using"
                'the German subset, using the dev set containing 305 queries.'
            ),
            'type': 'Reranking',
            'category': 's2p',
            'eval_splits': ['test'],
            'eval_langs': ['de'],
            'main_score': 'map',
            'revision': '8741c3b61cd36ed9ca1b3d4203543a41793239e2',
        }
1 change: 1 addition & 0 deletions mteb/tasks/Reranking/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .SciDocsReranking import *
from .StackOverflowDupQuestions import *
from .CMTEBReranking import *
from .MIRACLReranking import *
40 changes: 40 additions & 0 deletions mteb/tasks/Retrieval/GerDaLIRRetrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from ...abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from ...abstasks.BeIRTask import BeIRTask

import datasets


class GerDaLIR(AbsTaskRetrieval):
    _EVAL_SPLIT = 'test'

    @property
    def description(self):
        """Task metadata for the GerDaLIR German legal retrieval task."""
        # NOTE(review): unlike sibling tasks in this PR, no "revision" is
        # pinned here — confirm whether the hub dataset should be pinned.
        return {
            "name": "GerDaLIR",
            "hf_hub_name": "jinaai/ger_da_lir",
            "description": (
                "GerDaLIR is a legal information retrieval dataset created from the Open Legal Data platform."
            ),
            "reference": "https://github.com/lavis-nlp/GerDaLIR",
            "type": "Retrieval",
            "category": "s2p",
            "eval_splits": ["test"],
            "eval_langs": ["de"],
            "main_score": "ndcg_at_10",
        }

    def load_data(self, **kwargs):
        """Populate ``self.queries``, ``self.corpus`` and
        ``self.relevant_docs`` from the three hub subsets."""
        if self.data_loaded:
            return

        split = self._EVAL_SPLIT
        hub_name = self.description["hf_hub_name"]

        query_split = datasets.load_dataset(hub_name, "queries", split=split)
        corpus_split = datasets.load_dataset(hub_name, "corpus", split=split)
        qrels_split = datasets.load_dataset(hub_name, "qrels", split=split)

        queries = {}
        for record in query_split:
            queries[record["_id"]] = record["text"]

        corpus = {}
        for record in corpus_split:
            corpus[record["_id"]] = record

        # NOTE(review): each qrels row appears to pack its relevant document
        # ids as a space-separated string in "text" — verify against the
        # jinaai/ger_da_lir qrels schema.
        relevant = {}
        for record in qrels_split:
            relevant[record["_id"]] = dict.fromkeys(record["text"].split(" "), 1)

        self.queries = {split: queries}
        self.corpus = {split: corpus}
        self.relevant_docs = {split: relevant}

        self.data_loaded = True
67 changes: 67 additions & 0 deletions mteb/tasks/Retrieval/GermanDPRRetrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from ...abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from ...abstasks.BeIRTask import BeIRTask

import datasets


class GermanDPR(AbsTaskRetrieval):
    """Retrieval task built from the GermanDPR open-domain QA dataset.

    Each dataset row contributes one query (the question) plus its positive
    and hard-negative context passages; passages are de-duplicated across
    rows by their normalized text so the corpus contains each passage once.
    """

    # Evaluation split and language for this task.
    _EVAL_SPLIT = "test"
    _LANGUAGE = "de"

    @property
    def description(self):
        """Task metadata consumed by the MTEB task framework."""
        return {
            "name": "GermanDPR",
            "hf_hub_name": "deepset/germandpr",
            "description": "GermanDPR is a German Question Answering dataset for open-domain QA. It associates "
            "questions with a textual context containing the answer",
            "reference": "https://www.deepset.ai/germanquad",
            "type": "Retrieval",
            "category": "s2p",
            "eval_splits": [self._EVAL_SPLIT],
            "eval_langs": [self._LANGUAGE],
            "main_score": "ndcg_at_10",
            "revision": "5129d02422a66be600ac89cd3e8531b4f97d347d",
        }

    @staticmethod
    def _format_documents(docs, id_prefix="", existing_docs=None):
        """Convert one row's context passages into ``{doc_id: doc}`` entries.

        ``docs`` is a dict of parallel lists with 'title' and 'text' keys
        (the shape the loop below relies on).  ``existing_docs`` maps
        normalized passage text to a previously assigned id and is MUTATED
        in place, so passing the same dict across calls de-duplicates
        passages: a repeated passage reuses its first id instead of getting
        a new ``id_prefix``-based one.
        """
        if existing_docs is None:
            existing_docs = dict()
        result = {}
        for i, (title, content) in enumerate(zip(docs['title'], docs['text'])):
            # Drop everything up to the last '==\n' (section heading marker),
            # flatten newlines, and strip leading whitespace to normalize.
            formatted_content = content.split('==\n')[-1].replace('\n', ' ').lstrip()
            if formatted_content in existing_docs:
                # Passage already seen: reuse the id assigned on first sight.
                id_value = existing_docs[formatted_content]
            else:
                id_value = f"{id_prefix}{i}"
                existing_docs[formatted_content] = id_value
            result[id_value] = {"title": title, "text": formatted_content}
        return result

    def load_data(self, **kwargs):
        """Build queries, corpus, and qrels from the GermanDPR test split.

        Query ``q_{i}`` is row i's question; its positive contexts (ids
        prefixed ``doc_{i}_p_``) are the relevant documents, while hard
        negatives (``doc_{i}_n_``) only enlarge the corpus.  ``all_docs``
        is threaded through every ``_format_documents`` call so duplicate
        passages across rows share a single corpus entry.
        """
        if self.data_loaded:
            return

        data = datasets.load_dataset(
            self.description["hf_hub_name"], revision=self.description.get("revision", None), split=self._EVAL_SPLIT
        )
        corpus = dict()
        queries = dict()
        relevant_docs = dict()
        all_docs = dict()  # normalized passage text -> doc id, shared across rows
        for i, row in enumerate(data):
            q_id = f'q_{i}'
            queries[q_id] = row['question']
            pos_docs = self._format_documents(row['positive_ctxs'], id_prefix=f"doc_{i}_p_", existing_docs=all_docs)
            corpus.update(pos_docs)
            neg_docs = self._format_documents(
                row['hard_negative_ctxs'], id_prefix=f"doc_{i}_n_", existing_docs=all_docs
            )
            corpus.update(neg_docs)
            # Only the positive contexts count as relevant for this query.
            relevant_docs[q_id] = {k: 1 for k in pos_docs}
        self.queries = {self._EVAL_SPLIT: queries}
        self.corpus = {self._EVAL_SPLIT: corpus}
        self.relevant_docs = {self._EVAL_SPLIT: relevant_docs}

        self.data_loaded = True
51 changes: 51 additions & 0 deletions mteb/tasks/Retrieval/WikiCLIRRetrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from collections import defaultdict

import ir_datasets

from ...abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class WikiCLIRRetrieval(AbsTaskRetrieval):
    """English-query / German-document cross-lingual retrieval (WikiCLIR)."""

    _EVAL_SPLIT = 'test'

    @property
    def description(self):
        """Task metadata for the WikiCLIR en-de retrieval task."""
        return {
            'name': 'WikiCLIR',
            'ir_datasets_name': 'wikiclir/de',
            'reference': 'https://ir-datasets.com/wikiclir#wikiclir/de',
            'description': (
                'A Cross-Language IR (CLIR) collection between English queries and German documents '
                'built from Wikipedia. Queries are limited to the first 10k queries to reduce the '
                'evaluation time.'
            ),
            'type': 'Retrieval',
            'category': 's2p',
            'eval_splits': [self._EVAL_SPLIT],
            'eval_langs': ['en-de'],
            'main_score': 'ndcg_at_10',
        }

    def load_data(self, **kwargs):
        """Load queries (capped at 10k), the full corpus, and the qrels
        restricted to the retained queries, via ``ir_datasets``.

        Fixes over the previous version: stop iterating the query stream once
        the 10k cap is reached (it previously scanned every query just to
        discard it), and store queries in a plain dict — the values are
        strings, so ``defaultdict(dict)`` advertised the wrong value type.
        """
        if self.data_loaded:
            return

        dataset = ir_datasets.load(self.description['ir_datasets_name'])

        # Keep only the first 10k queries to bound evaluation time.
        queries = {}
        for item in dataset.queries_iter():
            if len(queries) >= 10_000:
                break
            queries[item.query_id] = item.first_sent

        corpus = {item.doc_id: {'title': item.title, 'text': item.text} for item in dataset.docs_iter()}

        # Keep only judgments whose query survived the cap above.
        qrel_dict = defaultdict(dict)
        for item in dataset.qrels_iter():
            if item.query_id in queries:
                qrel_dict[item.query_id][item.doc_id] = item.relevance

        self.queries = {self._EVAL_SPLIT: queries}
        self.corpus = {self._EVAL_SPLIT: corpus}
        self.relevant_docs = {self._EVAL_SPLIT: qrel_dict}

        self.data_loaded = True
39 changes: 39 additions & 0 deletions mteb/tasks/Retrieval/XMarketRetrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from ...abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from ...abstasks.BeIRTask import BeIRTask

import datasets


class XMarket(AbsTaskRetrieval):
    _EVAL_SPLIT = 'test'

    @property
    def description(self):
        """Task metadata for the XMarket German category-to-product task."""
        return {
            "name": "XMarket",
            "hf_hub_name": "jinaai/xmarket_de",
            "description": "XMarket is an ecommerce category to product retrieval dataset in German.",
            "reference": "https://xmrec.github.io/",
            "type": "Retrieval",
            "category": "s2p",
            "eval_splits": ["test"],
            "eval_langs": ["de"],
            "main_score": "ndcg_at_10",
            "revision": "2336818db4c06570fcdf263e1bcb9993b786f67a",
        }

    def load_data(self, **kwargs):
        """Populate queries, corpus, and qrels from the three hub subsets."""
        if self.data_loaded:
            return

        hub_name = self.description["hf_hub_name"]

        def _load(subset):
            # All three subsets live under the same split name.
            return datasets.load_dataset(hub_name, subset, split=self._EVAL_SPLIT)

        query_data = _load("queries")
        doc_data = _load("corpus")
        judgement_data = _load("qrels")

        self.queries = {self._EVAL_SPLIT: {q["_id"]: q["text"] for q in query_data}}
        self.corpus = {self._EVAL_SPLIT: {d["_id"]: d for d in doc_data}}
        # NOTE(review): each qrels row appears to pack its relevant document
        # ids as a space-separated string in "text" — verify against the
        # jinaai/xmarket_de qrels schema.
        self.relevant_docs = {
            self._EVAL_SPLIT: {r["_id"]: dict.fromkeys(r["text"].split(" "), 1) for r in judgement_data}
        }

        self.data_loaded = True
5 changes: 5 additions & 0 deletions mteb/tasks/Retrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,8 @@
from .SCIDOCSPLRetrieval import *
from .SciFactPLRetrieval import *
from .TRECCOVIDPLRetrieval import *
from .WikiCLIRRetrieval import *
from .GerDaLIRRetrieval import *
from .GermanDPRRetrieval import *
from .XMarketRetrieval import *

21 changes: 21 additions & 0 deletions mteb/tasks/STS/GermanSTSBenchmarkSTS.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from ...abstasks.AbsTaskSTS import AbsTaskSTS


class GermanSTSBenchmarkSTS(AbsTaskSTS):
    @property
    def description(self):
        """Task metadata for the German translation of the STS Benchmark."""
        metadata = dict(
            name="GermanSTSBenchmark",
            hf_hub_name="jinaai/german-STSbenchmark",
            description=(
                "Semantic Textual Similarity Benchmark (STSbenchmark) dataset translated into German. "
                "Translations were originally done by T-Systems on site services GmbH."
            ),
            reference="https://github.com/t-systems-on-site-services-gmbh/german-STSbenchmark",
            type="STS",
            category="s2s",
            eval_splits=["validation", "test"],
            eval_langs=["de"],
            main_score="cosine_spearman",
            min_score=0,
            max_score=5,
            revision="49d9b423b996fea62b483f9ee6dfb5ec233515ca",
        )
        return metadata
1 change: 1 addition & 0 deletions mteb/tasks/STS/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
from .STSBenchmarkSTS import *
from .CMTEBSTS import *
from .PolishSTS import *
from .GermanSTSBenchmarkSTS import *
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ scipy
sentence_transformers>=2.2.0
torch
tqdm
rich
rich
ir_datasets==0.5.5
Loading