-
Notifications
You must be signed in to change notification settings - Fork 44
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Prepare addition of the TREC Tip-of-the-Tongue dataset #235 * Prepare addition of the TREC Tip-of-the-Tongue dataset #235 * a few tweaks * mf * title type * documentation * fix yaml error in other file * typing * rename trec-tip-of-the-tongue to trec-tot and added year * rename trec-tip-of-the-tongue to trec-tot and added year * rename trec-tip-of-the-tongue to trec-tot and added year --------- Co-authored-by: Maik Fröbe <[email protected]>
- Loading branch information
1 parent
a1c0863
commit d5b7ce3
Showing
7 changed files
with
163 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import ir_datasets | ||
from ir_datasets.util import ZipExtract, Cache, Lazy, DownloadConfig | ||
from ir_datasets.formats import TrecQrels, JsonlQueries, JsonlDocs, TrecQrels | ||
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation, Deprecated | ||
from typing import NamedTuple, List, Dict | ||
|
||
# Dataset identifier: used below as the registry key (and prefix of the subset keys),
# the download-config context, the folder under the ir_datasets home path, and the
# stem of the documentation YAML file.
NAME = 'trec-tot' | ||
|
||
|
||
class TipOfTheTongueDoc(NamedTuple):
    """One document of the TREC Tip-of-the-Tongue corpus (used as ``doc_cls`` for the corpus reader)."""
    doc_id: str
    page_title: str
    wikidata_id: str
    wikidata_classes: List[str]
    text: str
    sections: Dict[str, str]
    infoboxes: List[Dict[str, str]]

    def default_text(self):
        """
        Return the page title followed by the document text, separated by a single space.
        """
        return ' '.join((self.page_title, self.text))
|
||
|
||
class TipOfTheTongueQuery(NamedTuple):
    """One tip-of-the-tongue query (used as ``query_cls`` for the query reader)."""
    query_id: str
    url: str
    domain: str
    title: str
    text: str
    sentence_annotations: List[Dict[str, str]]

    def default_text(self):
        """
        Return the query title followed by the query text, separated by a single space.

        Title and text together are everything available to someone who wants to
        respond to such an information need, so both make up the default text.
        """
        parts = [self.title, self.text]
        return ' '.join(parts)
|
||
|
||
# Maps each TipOfTheTongueQuery field name to the corresponding key of the query JSONL records.
# Only ``query_id`` differs from its source key (``id``).
QUERY_MAP = {
    'query_id': 'id',
    'url': 'url',
    'domain': 'domain',
    'title': 'title',
    'text': 'text',
    'sentence_annotations': 'sentence_annotations',
}
|
||
|
||
def _init():
    """Create the trec-tot datasets and register them with ir_datasets.

    Registers the documentation-only base dataset under ``NAME``, the 2023 corpus
    under ``NAME/2023``, and the ``train`` and ``dev`` splits (corpus, queries and
    qrels) under ``NAME/2023/train`` and ``NAME/2023/dev``.

    Returns:
        A ``(base, subsets)`` tuple, where ``subsets`` maps ``'2023'``,
        ``'2023/train'`` and ``'2023/dev'`` to their ``Dataset`` objects.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    main_dlc = dlc['main']

    def cached_member(member):
        # Every file lives under 'TREC-TOT/' inside the main download and is
        # cached under the '2023/' folder of this dataset's home directory.
        return Cache(ZipExtract(main_dlc, f'TREC-TOT/{member}'), base_path/f'2023/{member}')

    base = Dataset(documentation('_'))
    ir_datasets.registry.register(NAME, base)

    # A single corpus handler is shared by the corpus-only dataset and both splits.
    corpus = JsonlDocs(cached_member('corpus.jsonl'), doc_cls=TipOfTheTongueDoc, lang='en')

    subsets = {}
    subsets['2023'] = Dataset(corpus, documentation('2023'))
    ir_datasets.registry.register(f'{NAME}/2023', subsets['2023'])

    for split in ('train', 'dev'):
        subset_id = f'2023/{split}'
        queries = JsonlQueries(
            cached_member(f'{split}/queries.jsonl'),
            query_cls=TipOfTheTongueQuery,
            mapping=QUERY_MAP,
            lang='en',
        )
        qrels = TrecQrels(
            cached_member(f'{split}/qrel.txt'),
            {0: 'Not Relevant', 1: 'Relevant'},
        )
        subsets[subset_id] = Dataset(corpus, queries, qrels, documentation(subset_id))
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])

    return base, subsets
|
||
|
||
# Build and register the datasets when the module is imported; the results stay
# available as the module attributes `base` and `subsets`.
base, subsets = _init() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
_: | ||
pretty_name: 'TREC Tip-of-the-Tongue' | ||
desc: ' | ||
<p> | ||
Tip of the tongue: The phenomenon of failing to retrieve something from memory, combined with partial recall and the feeling that retrieval is imminent. More details <a href="https://trec-tot.github.io/guidelines">are available on the official page for the TREC Tip-of-the-Tongue (ToT) Track</a>. | ||
</p> | ||
' | ||
|
||
2023: | ||
desc: ' | ||
<p> | ||
Corpus for the TREC 2023 tip-of-the-tongue search track. | ||
</p> | ||
' | ||
|
||
2023/train: | ||
desc: ' | ||
<p> | ||
Train query set for the TREC 2023 tip-of-the-tongue search track. | ||
</p> | ||
' | ||
|
||
2023/dev: | ||
desc: ' | ||
<p> | ||
Dev query set for the TREC 2023 tip-of-the-tongue search track. | ||
</p> | ||
' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.