diff --git a/seacrowd/sea_datasets/mkqa/__init__.py b/seacrowd/sea_datasets/mkqa/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/mkqa/mkqa.py b/seacrowd/sea_datasets/mkqa/mkqa.py new file mode 100644 index 000000000..009f6d1a9 --- /dev/null +++ b/seacrowd/sea_datasets/mkqa/mkqa.py @@ -0,0 +1,247 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import copy
import json
from pathlib import Path
from typing import Dict, Iterator, List, Tuple, Union

import datasets

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@article{longpre-etal-2021-mkqa,
    title = "{MKQA}: A Linguistically Diverse Benchmark for Multilingual Open Domain Question Answering",
    author = "Longpre, Shayne and
      Lu, Yi and
      Daiber, Joachim",
    editor = "Roark, Brian and
      Nenkova, Ani",
    journal = "Transactions of the Association for Computational Linguistics",
    volume = "9",
    year = "2021",
    address = "Cambridge, MA",
    publisher = "MIT Press",
    url = "https://aclanthology.org/2021.tacl-1.82",
    doi = "10.1162/tacl_a_00433",
    pages = "1389--1406",
}
"""

_DATASETNAME = "mkqa"

_DESCRIPTION = """\
Multilingual Knowledge Questions and Answers (MKQA), an open-domain question answering evaluation set comprising 10k question-answer pairs aligned across 26 typologically diverse languages (260k question-answer pairs in total)
"""

_HOMEPAGE = "https://github.com/apple/ml-mkqa"

_LICENSE = Licenses.CC_BY_SA_3_0.value

_LOCAL = False

_URLS = {
    _DATASETNAME: "https://github.com/apple/ml-mkqa/raw/main/dataset/mkqa.jsonl.gz",
}

_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class MKQADataset(datasets.GeneratorBasedBuilder):
    """
    MKQA, an open-domain question answering evaluation set comprising 10k question-answer pairs
    aligned across 26 typologically diverse languages (260k question-answer pairs in total).
    The goal of this dataset is to provide a challenging benchmark for question answering quality
    across a wide set of languages.

    Each entry of ``LANGUAGES`` produces one ``source`` and one ``seacrowd_qa`` config.
    The empty entry produces the configs without a language infix (``mkqa_source`` and
    ``mkqa_seacrowd_qa``), which cover every language in ``_SOURCE_LANGUAGES``.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    # "" selects all languages; the others follow the 3-letter code convention
    # suggested since NusaCrowd.
    LANGUAGES = [
        "",
        "khm",
        "msa",
        "tha",
        "vie",
    ]

    # Language keys as they are used for the "answers" / "queries" features.
    _SOURCE_LANGUAGES = [
        "ar",
        "da",
        "de",
        "en",
        "es",
        "fi",
        "fr",
        "he",
        "hu",
        "it",
        "ja",
        "ko",
        "km",
        "ms",
        "nl",
        "no",
        "pl",
        "pt",
        "ru",
        "sv",
        "th",
        "tr",
        "vi",
        "zh_cn",
        "zh_hk",
        "zh_tw",
    ]

    # Maps the 3-letter subset codes in LANGUAGES to the 2-letter source keys.
    _LANG_3TO2 = {
        "khm": "km",
        "msa": "ms",
        "tha": "th",
        "vie": "vi",
    }

    BUILDER_CONFIGS = [
        *[
            SEACrowdConfig(
                name=f"{_DATASETNAME}_{subset_lang}{'_' if subset_lang else ''}source",
                version=datasets.Version(_SOURCE_VERSION),
                description=f"{_DATASETNAME} source schema",
                schema="source",
                subset_id=f"{_DATASETNAME}_{subset_lang}",
            )
            for subset_lang in LANGUAGES
        ],
        *[
            SEACrowdConfig(
                name=f"{_DATASETNAME}_{subset_lang}{'_' if subset_lang else ''}seacrowd_qa",
                version=datasets.Version(_SEACROWD_VERSION),
                description=f"{_DATASETNAME} SEACrowd schema",
                schema="seacrowd_qa",
                subset_id=f"{_DATASETNAME}_{subset_lang}",
            )
            for subset_lang in LANGUAGES
        ],
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _subset_language(self) -> str:
        """Return the 2-letter source key of the configured subset, or "" for all languages.

        The language is the part of ``config.subset_id`` after the last underscore
        (``"mkqa_khm"`` -> ``"khm"`` -> ``"km"``, ``"mkqa_"`` -> ``""``).
        """
        lang = self.config.subset_id.rsplit("_", 1)[-1]
        return self._LANG_3TO2.get(lang, lang)

    def _selected_languages(self) -> List[str]:
        """Return the source language keys covered by the current config."""
        lang = self._subset_language()
        return [lang] if lang else list(self._SOURCE_LANGUAGES)

    def _info(self) -> datasets.DatasetInfo:
        """Build the ``DatasetInfo`` for the configured schema.

        Raises:
            NotImplementedError: if the config schema is neither ``source`` nor ``seacrowd_qa``.
        """
        if self.config.schema == "source":
            languages = self._selected_languages()
            answer_feature = {
                "type": datasets.ClassLabel(
                    names=[
                        "binary",
                        "date",
                        "entity",
                        "long_answer",
                        "number",
                        "number_with_unit",
                        "short_phrase",
                        "unanswerable",
                    ]
                ),
                "entity": datasets.Value("string"),
                "text": datasets.Value("string"),
                "aliases": [datasets.Value("string")],
            }
            features = datasets.Features(
                {
                    "query": datasets.Value("string"),
                    # Every language gets its own copy of the answer feature spec.
                    "answers": {cur_lang: [dict(answer_feature)] for cur_lang in languages},
                    "queries": {cur_lang: datasets.Value("string") for cur_lang in languages},
                    "example_id": datasets.Value("string"),
                }
            )

        elif self.config.schema == "seacrowd_qa":
            # Work on a deep copy: schemas.qa_features is a shared module-level object,
            # and adding the meta fields in place would change the schema seen by every
            # other builder that uses it in the same process.
            features = copy.deepcopy(schemas.qa_features)
            features["meta"]["answer_entity"] = datasets.Sequence(datasets.Value("string"))
            features["meta"]["answer_aliases"] = datasets.Sequence(datasets.Sequence(datasets.Value("string")))

        else:  # schema not found! should NOT reach here ...
            raise NotImplementedError()

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download and extract the data file and expose it as a single TRAIN split."""
        urls = _URLS[_DATASETNAME]
        data_path = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": data_path},
            ),
        ]

    def _generate_examples(self, filepath: Path) -> Iterator[Tuple[Union[int, str], Dict]]:
        """Yield examples as (key, example) tuples.

        The file is read as JSON lines and streamed record by record, so the whole
        dataset is never held in memory at once.

        Args:
            filepath: path of the extracted JSON-lines file.

        Yields:
            ``source`` schema: ``(int(example_id), record)``, one per input record.
            ``seacrowd_qa`` schema: ``("<example_id>_<lang>", example)``, one per input
            record and selected language.
        """
        lang = self._subset_language()
        languages = self._selected_languages()
        schema = self.config.schema

        with open(filepath, "r", encoding="utf8") as ipt:
            for line in ipt:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue

                cur = json.loads(line)
                cur["example_id"] = str(cur["example_id"])
                if lang:
                    # Single-language subset: drop every other language from the record.
                    for key in ("answers", "queries"):
                        cur[key] = {k: v for k, v in cur[key].items() if k == lang}

                if schema == "source":
                    # Fill in the optional answer fields so every answer has the same keys.
                    for anslist in cur["answers"].values():
                        for ans in anslist:
                            ans.setdefault("entity", "")
                            ans.setdefault("aliases", [])
                    yield int(cur["example_id"]), cur

                elif schema == "seacrowd_qa":
                    for cur_lang in languages:
                        answers = cur["answers"][cur_lang]
                        ret = {
                            "id": f'{cur["example_id"]}_{cur_lang}',
                            "question_id": cur["example_id"],
                            "document_id": "",
                            "question": cur["queries"][cur_lang],
                            # NOTE(review): "type" is emitted as a list of per-answer types;
                            # confirm this matches the "type" field of schemas.qa_features.
                            "type": [ans["type"] for ans in answers],
                            "choices": [],
                            "context": "",
                            "answer": [ans.get("text", None) for ans in answers],
                            "meta": {f"answer_{k}": [ans.get(k, None) for ans in answers] for k in ["entity", "aliases"]},
                        }
                        yield ret["id"], ret