forked from SEACrowd/seacrowd-datahub
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
…EACrowd#133) * implement burmese_romanize dataloader * add _LANGUAGES
- Loading branch information
1 parent
6f7a0d4
commit 8a9512f
Showing
2 changed files
with
121 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
121 changes: 121 additions & 0 deletions
121
seacrowd/sea_datasets/burmese_romanize/burmese_romanize.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
from pathlib import Path | ||
from typing import Dict, List, Tuple | ||
|
||
import datasets | ||
import pandas as pd | ||
|
||
from seacrowd.utils import schemas | ||
from seacrowd.utils.configs import SEACrowdConfig | ||
from seacrowd.utils.constants import Tasks, Licenses | ||
|
||
_CITATION = """\ | ||
@inproceedings{chenchen2017statistical, | ||
author = {Ding Chenchen and | ||
Chea Vichet and | ||
Pa Win Pa and | ||
Utiyama Masao and | ||
Sumita Eiichiro}, | ||
title = {Statistical Romanization for Abugida Scripts: Data and Experiment on Khmer and Burmese}, | ||
booktitle = {Proceedings of the 23rd Annual Conference of the Association for Natural Language Processing, | ||
{NLP2017}, Tsukuba, Japan, 13-17 March 2017}, | ||
year = {2017}, | ||
url = {https://www.anlp.jp/proceedings/annual_meeting/2017/pdf_dir/P5-7.pdf}, | ||
} | ||
""" | ||
|
||
_DATASETNAME = "burmese_romanize" | ||
_DESCRIPTION = """\ | ||
This dataset consists of 2,335 Burmese names from real university students and faculty, | ||
public figures, and minorities from Myanmar. Each entry includes the original name in | ||
Burmese script, its corresponding Romanization, and the aligned Burmese and Latin | ||
graphemes. | ||
""" | ||
|
||
_HOMEPAGE = "http://www.nlpresearch-ucsy.edu.mm/NLP_UCSY/name-db.html" | ||
_LANGUAGES = ["mya"] | ||
_LICENSE = Licenses.CC_BY_NC_SA_4_0.value | ||
_URLS = "http://www.nlpresearch-ucsy.edu.mm/NLP_UCSY/myanmaroma.zip" | ||
|
||
_SUPPORTED_TASKS = [Tasks.TRANSLITERATION] | ||
_SOURCE_VERSION = "1.0.0" | ||
_SEACROWD_VERSION = "1.0.0" | ||
|
||
|
||
class BurmeseRomanizeDataset(datasets.GeneratorBasedBuilder): | ||
"""Romanization of names in Burmese script""" | ||
|
||
SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) | ||
SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) | ||
|
||
SEACROWD_SCHEMA_NAME = "t2t" | ||
|
||
BUILDER_CONFIGS = [ | ||
SEACrowdConfig( | ||
name=f"{_DATASETNAME}_source", | ||
version=SOURCE_VERSION, | ||
description=f"{_DATASETNAME} source schema", | ||
schema="source", | ||
subset_id=f"{_DATASETNAME}", | ||
), | ||
SEACrowdConfig( | ||
name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", | ||
version=SEACROWD_VERSION, | ||
description=f"{_DATASETNAME} SEACrowd schema", | ||
schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", | ||
subset_id=f"{_DATASETNAME}", | ||
), | ||
] | ||
|
||
DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" | ||
|
||
def _info(self) -> datasets.DatasetInfo: | ||
if self.config.schema == "source": | ||
features = datasets.Features( | ||
{ | ||
"original": datasets.Value("string"), | ||
"romanized": datasets.Value("string"), | ||
"aligned_graphemes": datasets.Sequence(datasets.Value("string")), | ||
} | ||
) | ||
elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": | ||
features = schemas.text2text_features | ||
|
||
return datasets.DatasetInfo( | ||
description=_DESCRIPTION, | ||
features=features, | ||
homepage=_HOMEPAGE, | ||
license=_LICENSE, | ||
citation=_CITATION, | ||
) | ||
|
||
def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: | ||
data_dir = Path(dl_manager.download_and_extract(_URLS)) / "myanmaroma" | ||
|
||
return [ | ||
datasets.SplitGenerator( | ||
name=datasets.Split.TRAIN, | ||
gen_kwargs={ | ||
"filepath": data_dir / "myanmaroma.txt", | ||
"split": "train", | ||
}, | ||
), | ||
] | ||
|
||
def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: | ||
df = pd.read_csv(filepath, sep=" \|\|\| ", engine='python', header=None, names=["ori", "roman", "seg"]) | ||
if self.config.schema == "source": | ||
for i, row in df.iterrows(): | ||
yield i, { | ||
"original": row["ori"], | ||
"romanized": row["roman"], | ||
"aligned_graphemes": row["seg"].strip().split(), | ||
} | ||
elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": | ||
for i, row in df.iterrows(): | ||
yield i, { | ||
"id": str(i), | ||
"text_1": row["ori"], | ||
"text_2": row["roman"], | ||
"text_1_name": "original", | ||
"text_2_name": "romanized", | ||
} |