Skip to content

Commit

Permalink
trec-tot (#238)
Browse files Browse the repository at this point in the history
* Prepare addition of the TREC Tip-of-the-Tongue dataset #235

* Prepare addition of the TREC Tip-of-the-Tongue dataset #235

* a few tweaks

* mf

* title type

* documentation

* fix yaml error in other file

* typing

* rename trec-tip-of-the-tongue to trec-tot and added year

* rename trec-tip-of-the-tongue to trec-tot and added year

* rename trec-tip-of-the-tongue to trec-tot and added year

---------

Co-authored-by: Maik Fröbe <[email protected]>
  • Loading branch information
seanmacavaney and mam10eks authored Jun 15, 2023
1 parent a1c0863 commit d5b7ce3
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 2 deletions.
3 changes: 2 additions & 1 deletion ir_datasets/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from . import trec_mandarin
from . import trec_spanish
from . import trec_robust04
from . import trec_tot
from . import tripclick
from . import tweets2013_ia
from . import vaswani
Expand All @@ -51,4 +52,4 @@
from . import trec_cast # must be after wapo,car,msmarco_passage
from . import hc4
from . import neuclir # must be after hc4
from . import sara
from . import sara
71 changes: 71 additions & 0 deletions ir_datasets/datasets/trec_tot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import ir_datasets
from ir_datasets.util import ZipExtract, Cache, Lazy, DownloadConfig
from ir_datasets.formats import TrecQrels, JsonlQueries, JsonlDocs, TrecQrels
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation, Deprecated
from typing import NamedTuple, List, Dict

# Top-level dataset identifier. Used below to locate the documentation YAML,
# the storage directory under the ir_datasets home path, the download-config
# context, and as the prefix of every dataset ID that gets registered.
NAME = 'trec-tot'


class TipOfTheTongueDoc(NamedTuple):
    """Document record for the TREC Tip-of-the-Tongue corpus.

    Passed as ``doc_cls`` to the JSONL document reader for ``corpus.jsonl``.
    """
    doc_id: str
    page_title: str
    # NOTE(review): the wikidata_* fields presumably carry the Wikidata entity
    # ID and its class IDs for the page -- confirm against the corpus files.
    wikidata_id: str
    wikidata_classes: List[str]
    text: str
    sections: Dict[str, str]
    infoboxes: List[Dict[str, str]]

    def default_text(self):
        """
        Return the page title followed by the document text, separated by a single space.
        """
        return self.page_title + ' ' + self.text


class TipOfTheTongueQuery(NamedTuple):
    """Query record for the TREC Tip-of-the-Tongue topics.

    Passed as ``query_cls`` to the JSONL query reader for ``queries.jsonl``.
    """
    query_id: str
    url: str
    domain: str
    title: str
    text: str
    sentence_annotations: List[Dict[str, str]]

    def default_text(self):
        """
        Return the query title and text joined by a single space.

        Title and text together are everything available to someone who wants
        to respond to the information need, so both are used.
        """
        return ' '.join((self.title, self.text))


# Field mapping handed to JsonlQueries via ``mapping=``. The keys are the
# TipOfTheTongueQuery field names; only ``query_id`` maps to a different name
# ('id'), every other field maps to itself.
# NOTE(review): values are presumably the keys found in queries.jsonl -- confirm.
QUERY_MAP = {
    'query_id': 'id',
    **{field: field for field in ('url', 'domain', 'title', 'text', 'sentence_annotations')},
}


def _init():
    """Build and register the trec-tot datasets.

    Registers, in this order, ``NAME`` (documentation only), ``NAME/2023``
    (corpus only) and ``NAME/2023/train`` / ``NAME/2023/dev`` (corpus, queries
    and qrels).

    Returns:
        A ``(base, subsets)`` tuple: the top-level Dataset and a dict of the
        sub-datasets keyed by their ID relative to ``NAME``.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path() / NAME
    archive = DownloadConfig.context(NAME, base_path)['main']
    subsets = {}

    def _member(rel_path):
        # Wrap one member of the 'main' download (under TREC-TOT/) in a Cache
        # targeting the same relative path below <base_path>/2023/.
        return Cache(
            ZipExtract(archive, f'TREC-TOT/{rel_path}'),
            base_path / f'2023/{rel_path}',
        )

    base = Dataset(documentation('_'))
    ir_datasets.registry.register(NAME, base)

    # A single docs handler is shared by the corpus-only dataset and both splits.
    corpus = JsonlDocs(_member('corpus.jsonl'), doc_cls=TipOfTheTongueDoc, lang='en')

    year_dataset = Dataset(corpus, documentation('2023'))
    subsets['2023'] = year_dataset
    ir_datasets.registry.register(f'{NAME}/2023', year_dataset)

    for split in ('train', 'dev'):
        key = f'2023/{split}'
        queries = JsonlQueries(
            _member(f'{split}/queries.jsonl'),
            query_cls=TipOfTheTongueQuery,
            mapping=QUERY_MAP,
            lang='en',
        )
        qrels = TrecQrels(
            _member(f'{split}/qrel.txt'),
            {0: 'Not Relevant', 1: 'Relevant'},
        )
        split_dataset = Dataset(corpus, queries, qrels, documentation(key))
        subsets[key] = split_dataset
        ir_datasets.registry.register(f'{NAME}/{key}', split_dataset)

    return base, subsets


# Runs at import time: builds and registers the datasets, and keeps the
# returned objects available as module-level attributes.
base, subsets = _init()
3 changes: 2 additions & 1 deletion ir_datasets/docs/sara.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ A set of sensitivity-aware relevance assessments. More information is available
<ul>
<li><a href="https://github.com/JackMcKechnie/SARA-A-Collection-of-Sensitivity-Aware-Relevance-Assessments">SARA</a></li>
</ul>
</ul>
'
28 changes: 28 additions & 0 deletions ir_datasets/docs/trec-tot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
_:
pretty_name: 'TREC Tip-of-the-Tongue'
desc: '
<p>
Tip of the tongue: The phenomenon of failing to retrieve something from memory, combined with partial recall and the feeling that retrieval is imminent. More details <a href="https://trec-tot.github.io/guidelines">are available on the official page for the TREC Tip-of-the-Tongue (ToT) Track</a>.
</p>
'

2023:
desc: '
<p>
Corpus for the TREC 2023 tip-of-the-tongue search track.
</p>
'

2023/train:
desc: '
<p>
Train query set for TREC 2023 tip-of-the-tongue search track.
</p>
'

2023/dev:
desc: '
<p>
Dev query set for TREC 2023 tip-of-the-tongue search track.
</p>
'
8 changes: 8 additions & 0 deletions ir_datasets/etc/downloads.json
Original file line number Diff line number Diff line change
Expand Up @@ -3845,6 +3845,14 @@
"cache_path": "trec4/qrels.gz"
}
},

"trec-tot": {
"main": {
"url": "https://surfdrive.surf.nl/files/index.php/s/FaEK4xc6Xp2JcAJ/download",
"expected_md5": "f84fe82cb80e3ee1072576c8d6c4a417",
"cache_path": "trec-tot.zip"
}
},

"tripclick": {
"benchmark": {
Expand Down
3 changes: 3 additions & 0 deletions ir_datasets/etc/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,9 @@
"trec-spanish": {"docs": {"count": 120605, "fields": {"doc_id": {"max_len": 13, "common_prefix": ""}}}},
"trec-spanish/trec3": {"docs": {"_ref": "trec-spanish"}, "queries": {"count": 25}, "qrels": {"count": 19005, "fields": {"relevance": {"counts_by_value": {"1": 4766, "0": 14239}}}}},
"trec-spanish/trec4": {"docs": {"_ref": "trec-spanish"}, "queries": {"count": 25}, "qrels": {"count": 13109, "fields": {"relevance": {"counts_by_value": {"1": 2202, "0": 10907}}}}},
"trec-tot/2023": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
"trec-tot/2023/dev": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
"trec-tot/2023/train": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
"tripclick": {"docs": {"count": 1523878, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
"tripclick/logs": {"docs": {"count": 5196956, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "qlogs": {"count": 5317350}},
"tripclick/test": {"docs": {"_ref": "tripclick"}, "queries": {"count": 3525}, "scoreddocs": {"count": 3486402}},
Expand Down
49 changes: 49 additions & 0 deletions test/integration/trec_tot.py

Large diffs are not rendered by default.

0 comments on commit d5b7ce3

Please sign in to comment.