trec-tot (#238)

* Prepare addition of the TREC Tip-of-the-Tongue dataset #235 * Prepare addition of the TREC Tip-of-the-Tongue dataset #235 * a few tweaks * mf * title type * documentation * fix yaml error in other file * typing * rename trec-tip-of-the-tongue to trec-tot and added year * rename trec-tip-of-the-tongue to trec-tot and added year * rename trec-tip-of-the-tongue to trec-tot and added year --------- Co-authored-by: Maik Fröbe <[email protected]>
allenai · Jun 15, 2023 · d5b7ce3 · d5b7ce3
1 parent a1c0863
commit d5b7ce3
Show file tree

Hide file tree

Showing 7 changed files with 163 additions and 2 deletions.
diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py
@@ -41,6 +41,7 @@
 from . import trec_mandarin
 from . import trec_spanish
 from . import trec_robust04
+from . import trec_tot
 from . import tripclick
 from . import tweets2013_ia
 from . import vaswani
@@ -51,4 +52,4 @@
 from . import trec_cast # must be after wapo,car,msmarco_passage
 from . import hc4
 from . import neuclir # must be after hc4
-from . import sara
+from . import sara
diff --git a/ir_datasets/datasets/trec_tot.py b/ir_datasets/datasets/trec_tot.py
@@ -0,0 +1,71 @@
+import ir_datasets
+from ir_datasets.util import ZipExtract, Cache, Lazy, DownloadConfig
+from ir_datasets.formats import TrecQrels, JsonlQueries, JsonlDocs, TrecQrels
+from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation, Deprecated
+from typing import NamedTuple, List, Dict
+
+NAME = 'trec-tot'
+
+
+class TipOfTheTongueDoc(NamedTuple):
+    doc_id: str
+    page_title: str
+    wikidata_id: str
+    wikidata_classes: List[str]
+    text: str
+    sections: Dict[str, str]
+    infoboxes: List[Dict[str, str]]
+
+    def default_text(self):
+        """
+        We use the title and text of the TipOfTheTongueDoc as default_text because that is everything available for users who want to respond to such an information need.
+        """
+        return self.page_title + ' ' + self.text
+
+
+class TipOfTheTongueQuery(NamedTuple):
+    query_id: str
+    url: str
+    domain: str
+    title: str
+    text: str
+    sentence_annotations: List[Dict[str, str]]
+
+    def default_text(self):
+        return self.title + ' ' + self.text
+
+
+QUERY_MAP = {'query_id': 'id', 'url': 'url', 'domain': 'domain', 'title': 'title', 'text': 'text', 'sentence_annotations': 'sentence_annotations'}
+
+
+def _init():
+    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
+    base_path = ir_datasets.util.home_path()/NAME
+    dlc = DownloadConfig.context(NAME, base_path)
+    subsets = {}
+
+    main_dlc = dlc['main']
+    base = Dataset(
+        documentation('_'),
+    )
+    ir_datasets.registry.register(NAME, base)
+
+    docs_2023_handler = JsonlDocs(Cache(ZipExtract(main_dlc, 'TREC-TOT/corpus.jsonl'), base_path/'2023/corpus.jsonl'), doc_cls=TipOfTheTongueDoc, lang='en')
+    subsets['2023'] = Dataset(
+        docs_2023_handler,
+        documentation('2023'),
+    )
+    ir_datasets.registry.register(f'{NAME}/2023', subsets['2023'])
+    for s in ['train', 'dev']:
+        subsets[f'2023/{s}'] = Dataset(
+            docs_2023_handler,
+            JsonlQueries(Cache(ZipExtract(main_dlc, f'TREC-TOT/{s}/queries.jsonl'), base_path/f'2023/{s}/queries.jsonl'), query_cls=TipOfTheTongueQuery, mapping=QUERY_MAP, lang='en'),
+            TrecQrels(Cache(ZipExtract(main_dlc, f'TREC-TOT/{s}/qrel.txt'), base_path/f'2023/{s}/qrel.txt'), {0: 'Not Relevant', 1: 'Relevant'}),
+            documentation(f'2023/{s}'),
+        )
+        ir_datasets.registry.register(f'{NAME}/2023/{s}', subsets[f'2023/{s}'])
+
+    return base, subsets
+
+
+base, subsets = _init()
diff --git a/ir_datasets/docs/sara.yaml b/ir_datasets/docs/sara.yaml
@@ -7,4 +7,5 @@ A set of sensitivity-aware relevance assessments. More information is available
 
 <ul>
 <li><a href="https://github.com/JackMcKechnie/SARA-A-Collection-of-Sensitivity-Aware-Relevance-Assessments">SARA</a></li>
-</ul>
+</ul>
+'
diff --git a/ir_datasets/docs/trec-tot.yaml b/ir_datasets/docs/trec-tot.yaml
@@ -0,0 +1,28 @@
+_:
+  pretty_name: 'TREC Tip-of-the-Tongue'
+  desc: '
+<p>
+Tip of the tongue: The phenomenon of failing to retrieve something from memory, combined with partial recall and the feeling that retrieval is imminent. More details <a href="https://trec-tot.github.io/guidelines">are available on the official page for the TREC Tip-of-the-Tongue (ToT) Track</a>.
+</p>
+'
+
+2023:
+  desc: '
+<p>
+Corpus for the TREC 2023 tip-of-the-tongue search track.
+</p>
+'
+
+2023/train:
+  desc: '
+<p>
+Train query set for TREC 2023 tip-of-the-tongue search track.
+</p>
+'
+
+2023/dev:
+  desc: '
+<p>
+Dev query set for TREC 2023 tip-of-the-tongue search track.
+</p>
+'
diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json
@@ -3845,6 +3845,14 @@
       "cache_path": "trec4/qrels.gz"
     }
   },
+
+  "trec-tot": {
+    "main": {
+      "url": "https://surfdrive.surf.nl/files/index.php/s/FaEK4xc6Xp2JcAJ/download",
+      "expected_md5": "f84fe82cb80e3ee1072576c8d6c4a417",
+      "cache_path": "trec-tot.zip"
+    }
+  },
 
   "tripclick": {
     "benchmark": {

diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json
@@ -560,6 +560,9 @@
   "trec-spanish": {"docs": {"count": 120605, "fields": {"doc_id": {"max_len": 13, "common_prefix": ""}}}},
   "trec-spanish/trec3": {"docs": {"_ref": "trec-spanish"}, "queries": {"count": 25}, "qrels": {"count": 19005, "fields": {"relevance": {"counts_by_value": {"1": 4766, "0": 14239}}}}},
   "trec-spanish/trec4": {"docs": {"_ref": "trec-spanish"}, "queries": {"count": 25}, "qrels": {"count": 13109, "fields": {"relevance": {"counts_by_value": {"1": 2202, "0": 10907}}}}},
+  "trec-tot/2023": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
+  "trec-tot/2023/dev": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
+  "trec-tot/2023/train": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
   "tripclick": {"docs": {"count": 1523878, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
   "tripclick/logs": {"docs": {"count": 5196956, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "qlogs": {"count": 5317350}},
   "tripclick/test": {"docs": {"_ref": "tripclick"}, "queries": {"count": 3525}, "scoreddocs": {"count": 3486402}},

diff --git a/test/integration/trec_tot.py b/test/integration/trec_tot.py
-Original file line number
+Diff line change
@@ Expand Up @@
     <ul>
     <li><a href="https://github.com/JackMcKechnie/SARA-A-Collection-of-Sensitivity-Aware-Relevance-Assessments">SARA</a></li>
-    </ul>
+    </ul>
+    '