From 34e174925f26e58c1df125cf9ce3c02811298498 Mon Sep 17 00:00:00 2001 From: Jonibek Mansurov <44943993+MJonibek@users.noreply.github.com> Date: Thu, 18 Apr 2024 18:39:24 +0400 Subject: [PATCH] Update (#2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix bug unique ids * Closes #162 | Add Bloom-Captioning Dataloader (#198) * Init dataloader bloom captioning * Fix issue on multiple splits from its source * Change local var * Cater 'test' and 'val' split and fix the '_id' generation * fix: remove abstreact and change _LOCAL and _DESC * fix: _DESC indent * Format openslr.py and add init file * Closes #271 | Implement dataloader for UiT-ViCTSD (#300) * Implement UiT-ViCTSD dataloader * Improve subset IDs, feature types, code to generate examples * Closes #161 | Create dataset loader for ICON 161 (#317) * Create icon.py * Update icon.py * Create __init__.py * Closes #142 | Add Unimorph v4 dataloader (#168) * Add Unimorph dataloader Resolves #142 * Add Dataset to class name * Closes #71 | Create dataset loader for MASSIVE (#196) * add data loader for massive dataset * modify the class name & refactor the function name * change task name from pos tagging to slot filling & make check_file & change subset name to differentiate intent / slot filling tasks * Closes #14 | Create dataset loader for ara-close-lange (#243) * Add ara_close dataloader * Rename class name to AraCloseDataset * Closes #273 | Implement dataloader for UIT_ViON (#282) * Implement dataloader for UIT_ViON * Add __init__.py * Add {lang} in subset id for openslr * Closes #219 | Create dataloader for scb-mt-en-th-2020 (#287) * Create dataloader for scb-mt-en-th-2020 * Rename the data loader files to its snakecase * rename _DATASETNAME to snakecase * Fix languages setting * Update template.py * Add docstring openslr.py * Closes #277 | Implement dataloader for spamid_pair (#281) * Implemente dataloader for spamid_pair * Update seacrowd/sea_datasets/spamid_pair/spamid_pair.py Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * Add __init__.py * Update __init__.py --------- Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * Implemented dataloader for indoler * Add imqa schema and VISUAL_QUESTION_ANSWERING task (#380) * Update template.py Update DownloadManager documentation link in template.py * Closes #54 | Implement Dataloader for IndoSMD (#258) * feat: indosmd dataloader for source * refactor by pre-commit * IndoSMD: reformatted by pre-commit * Update changes on indosmd.py * revised line 223 in indosmd.py * Close#143 | Create dataset loader for Abui WordNet (#285) * add tydiqa dataloader * add id_vaccines_tweet dataloader * add uit-vicc dataloader * add ICON dataloader * add iaap_squad dataloader * add stb_ext dataloader * Revert "add iaap_squad dataloader" This reverts commit 1f8a5913e6505e1da402ec75816d250e8b6565c3. * Revert "add tydiqa dataloader" This reverts commit 6bf4546ba956081f7400c6a01f3804ab3d6d0f5c. * Revert "add id_vaccines_tweet dataloader" This reverts commit 1154087ca9f95eabf93c3afc732dda11876f47fc. * Revert "add uit-vicc dataloader" This reverts commit 09661fa7addf48247c47cb8e124b08d85d99fbd6. * Revert "add ICON dataloader" This reverts commit 0891e58ee46b57dc052fcd88d11cb5c5f10d7e10. * Update stb_ext.py * add abui_wordnet dataloader * Revert "Update stb_ext.py" This reverts commit 59c530160265829329aebdf3a41bfd906e5e5930. 
* Delete seacrowd/sea_datasets/stb_ext/stb_ext.py * Delete seacrowd/sea_datasets/stb_ext/__init__.py * Update abui_wordnet.py * Update abui_wordnet.py * Update abui_wordnet.py --------- Co-authored-by: Lj Miranda Co-authored-by: Samuel Cahyawijaya * Added Morality Classification Tasks to constants.py (#371) * Closes #216 | Create dataset loader for Mozilla Pontoon (#260) * Begin first draft of Mozilla Pontoon dataloader * Add dataloader for Mozilla Pontoon * Remove enumerate in _generate_examples * Fix issues due to changed format, rename features and config names * Closes #157 | Create dataset loader for M3Exam (#302) * Add m3exam dataloader * Small change in m3exam.py * Fix bug during downloading * Add meta feature to seacrowd schema for m3exam * Rename class M3Exam to M3ExamDataset * Add image question answering * Merge two source schemas into one for m3exam * Fix image path, choices and answer in m3exam * Update CODEOWNERS * Rectify SEACrowd Internal Vars (#386) * Add missing __init__.py * add init * fix bug in phoatis load * add lang variables in dataloaders * Add dataset use ack on source HF repo into description * Closes #204 | Implement dataloader for Melayu_Sabah (#234) * Implement dataloader for Melayu_Sabah * Update name for the dataloader * Add _CITATION * Update seacrowd/sea_datasets/melayu_sabah/melayu_sabah.py * Apply suggestions from review * Moving unnecessary content in dialogue text * Update melayu_sabah.py * Improvement: Workflow Message to Mention Assignee in Staled Issues (#400) * Update stale.yml (#327) * Update stale.yml Test on adding vars on assignee & author of Issues & PR * Update stale.yml * Update stale.yml * Update stale.yml * Update stale.yml * Update stale.yml * Closes #272 | Create dataset loader for SNLI (#290) * [New Feature] Add SNLI dataloader * [Fix] SNLI rev according to PR review * [Chore] Add comment for accessibility * Update common_parser.py (#333) * Implement dataloader for UCLA Phonetic Corpus * Implement dataloader for KDE4 * removed redundant builder_config * Update cc3m_35l.py Changed into no parallelization since it kept being killed by the OS for some reason. 
* Fix: Workflow Assignee Mention (#410) * Update stale.yml * Fix: wrong quote in message (#411) * Update and fix bug on stale.yml * Closes #17 | Implement dataloader for Philippine Fake News Corpus (#331) * Implement dataloader * Edit dataloader class name * Simplify code * Fix citation typo * Closes #359 | Implement dataloader for LR-Sum (#368) * Implement dataloader * Fix short description * feat: mswc dataloader skeleton * feat: example for seacrowd schema * Closes #265 | Implement dataloader for `myxnli` (#336) * Implement dataloader for myxnli * update myxnli * Closes #112 | Implement Dataloader for Wisesight Thai Corpus (#279) * Add wisesight_thai_sentiment dataset * changes according to review * changes according to review * changes according to review * Add changes according to review * refactor: formatting * fix: subset * refactor: formatting * Closes #6 | Add Loader for XCOPA (#286) * initial add for loader * edit to include multi language * adjust comments * apply suggestion * fix by linter --------- Co-authored-by: fawwaz.mayda * Closes #140 | Add Dengue Filipino (#259) * add dengue filipino * update license and tasks * Update _LANGUAGE * Update dengue_filipino.py * feat: flores200 dataloader skeleton * Set only one source schema * Fix subnodes ids for root node alt_burmese_treebank * implement Filipino Gay Language dataloader (#66) * convert citation to raw string * Closes #210 | Create dataset loader for Orchid Corpus (#303) * Add orchid_pos dataloader * Rename OrchidPOS to OrchidPOSDataset * Fix parser bug in orchid_pos.py * Add .strip() in source orchid_pos * Cahange string for special char orchid_pos * fix: remove useless loop * refactor: remove unused loop * Closes #159 | Create dataset loader for CC-Aligned (#298) * Add cc_aligned_doc dataloader * Rename class and format cc_aligned_doc * Add SEACROWD_SCHEMA_NAME for cc_aligned_doc * Closes #268 | Implement dataloader for Thai Toxicity Tweet Corpus (#301) * Implement dataloader for Thai toxicity tweets * Fix description grammar * List labels as constant * Change task to ABUSIVE_LANGUAGE_PREDICTION, improve _generate_examples * Rename dataloader folder and file * Remove comment, change license value * Define SEACROWD_SCHEMA using _SUPPORTED_TASKS * Fix bug where example ID and index do not match * Closes #363 | Create dataset loader for identifikasi-bahasa (#379) * [add] initial commit * [add] dataset loader for identifikasi_bahasa * [refactor] removed __main__ * Update seacrowd/sea_datasets/identifikasi_bahasa/identifikasi_bahasa.py --------- Co-authored-by: Amir Djanibekov * Closes #182. 
| Implement dataloader for `roots_vi_ted` (#329) * Implement dataloader for roots_vi_ted * update * update * update * remove local data * reformat * Closes #180 | Implement `IndoMMLU` dataloader (#324) * Implement dataloader for indommlu * update * update * Closes #345 | Implemented dataloader for vlsp2016_ner (#372) * Implemented dataloader for vlsp2016_ner * Format vlsp2016_ner.py * Closes #276 | Implement PRDECT-ID dataloader (#322) * Implement PRDECT-ID dataloader Closes #276 * Add better type formatting * Follow id_google_play_review for structure * Include source configs for both emotion and sentiment * Closes #9 | Add bhinneka_korpus dataset loader (#175) * Add bhinnek_korpus dataset loader * Updating the suggested changes * Resolved review suggestions * Create indonesian_news_dataset dataloader * Closes #183 | Implement `wongnai_reviews` dataloader (#325) * Implement dataloader for wongnai_reviews * add __init__.py * update * update * Implement change requested by holylovenia * Closes #348 | Implemented dataloader for indoner_tourism (#373) * Implemented dataloader for indoner_tourism * Perform changes requested by ljvmiranda921 * Closes #361 | Create dataset loader for Thai-Lao Parallel Corpus (#384) * [add] dataloader for tha_lao_embassy_parcor, no citation yet * [add] citation; removed debug code * [style] make format restyle * [refactor] removed TODO code --------- Co-authored-by: Amir Djanibekov * Update constants.py * Closes #305 | Implement dataloader for UIT_ViOCD (#335) * Implement dataloader for UIT_ViOCD * update according to the review * Update _SUPPORTED_TASKS * Closes #362 | Create dataset loader for GKLMIP Khmer News Dataset (#383) * [add] dataloader for gklmip_newsclass * [refactor] changed licence value --------- Co-authored-by: Amir Djanibekov * Closes #358 | Create dataset loader for GKLMIP Product Sentiment (#417) * [add] dataset loader for gklmip_sentiment * [refactor] removed comment; removed "split" parameter in gen_kwargs --------- Co-authored-by: Amir Djanibekov * Update constants.py * Close #306 | Create dataset loader for ViHealthQA (#319) * Create dataset loader for ViHealthQA #306 * add class docstring * Update vihealthqa.py * Closes #10 | Create beaye_lexicon dataset loader (#320) * Create beaye_lexicon dataset loader * add implementation of eng-day word pairs * Closes #179 | Implement `indo_story_cloze` dataloader (#323) * Implement indo_story_cloze dataloader. 
* correct license * update according to the feedback * update * Closes #353 | Create dataset loader for FilWordNet (#377) * Add dataloader for FilWordNet * Update seacrowd/sea_datasets/filwordnet/filwordnet.py Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * Update seacrowd/sea_datasets/filwordnet/filwordnet.py Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * Fix formatting --------- Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * feat: id_sentiment_analysis dataloader * refactor: remove print * refactor: default config name * feat: subsets * Closes #350 | Implement dataloader for Indonesian PRONER (#399) * Implement dataloader for Indonesian PRONER * Add manual and automatic subsets --------- Co-authored-by: Railey Montalan * Implement dataloader for IMAD Malay Corpus (#402) Co-authored-by: ssfei81 * Update id_wsd.py * add thaigov (#412) * add thaigov * Update thaigov.py * add inline comment for file structure * Update and rename snli.py to snli_indo.py * Rename SNLI to SNLI Indo * Update snli_indo.py * [add] dataloader for sarawak_malay * Closes #264 | Create dataset loader for mySentence #264 (#291) * add mysentences dataloader * align the config name to subset_id * update mysentence config * Update mysentence.py * remove comment line * Update mysentence.py * Update mysentence config * Update mysentence.py * Update seacrowd/sea_datasets/mysentence/mysentence.py Fix the subset_id case-checking for data download * added __init__.py to ucla_phonetic * updated dataloader according to suggestions * Update memolon.py * fix: subset_id format * refactor: prepend dataset name to subset id * fix: first language is set to latin english * Add thai depression * Create __init__.py * Create __init__.py * Create __init__.py * Implement dataloader for SeaEval * Update template.py instruction for dataloader class name (#334) * Add documentation for dataloader class name * Update template.py * Update REVIEWING.md This makes adding the "Dataset" suffix optional and adds a reference to templates/templates.py as an example * Update REVIEWING.md fix file reference name --------- Co-authored-by: Salsabil Maulana Akbar * Closes #165 | Add BLOOM-LM dataset (#294) * Init add BLOOM-LM dataset * Adjusting changes based on review * fix typing on _generate_examples * update import based on formatter suggestion * Closes #349 | Create dataset loader for QASiNa (#418) * [add] dataloader for qasina * [refactor] renamed dataset class * [add] added contex_title to qa_seacrowd schema * [refactor, add] changed QA type, added "answer_start", "contx_length" information to meta * [refactor] bug fixes --------- Co-authored-by: Amir Djanibekov * Closes #263 | Implement dataloader for VIVOS (#398) * Implement dataloader for * Implement dataloader for VIVOS * Add missing __init__.py file * Change _LANGUAGES into list --------- Co-authored-by: Railey Montalan * Closes #190 | Create dataset loader for TydiQA (#251) * add tydiqa dataloader * Update tydiqa.py * add example helper and update config * Update tydiqa.py * Update Configs and _info * Update features in _info() * Update tydiqa.py This update covers the requested changes from @jen-santoso and @jamesjaya, please advise if any further changes are needed. Thanks. 
* add tydiqa_id subset * Update tydiqa.py Reformat long lines in the code and add IndoNLG in citation * remove tydiqa_id * Closes #338 | Created DataLoader for IndonesianNMT (#367) * Implementing Dataloader for indonesiannmt issue #338 * Update template.py * Implementing Dataloader for indonesiannmt issue #338 * removed if __main__ section * IndonesianNMT reconstructing dataloader * Implement ssp task, implement suggestions * format indonesiannmt --------- Co-authored-by: Holy Lovenia Co-authored-by: Jonibek Mansurov <44943993+MJonibek@users.noreply.github.com> * Closes #366 | Implement dataloader for Kheng.info Speech (#401) * Implement dataloader for Kheng.info Speech * Add init file * Closes #226 | Vi Pubmed dataloader (#391) * feat: vi_pubmed dataloader * fix: homepage * fix: non unique id error * refactor: class name * refactor: remove unused loop * Create __init__.py * [refactor] removed comment * Update flores200.py * refactor: remove main function * Closes #69 | Implement XStoryCloze Dataloader (#137) * implement xstorycloze dataloader * add __init__.py * update * remove ssp schema; add _LANGUAGES * remove unnecessary import; pascal case for class name * Closes #147 | implemented dataloader for gatitos dataset (#415) * implemented dataloader for gatitos dataset * added __init__.py to gatitos folder * Updated gatitos --------- Co-authored-by: ssfei81 * Update CODEOWNERS * Patch Workflow on Stale Checking (#482) * Update stale.yml * Create add-new-comment-on-stale * Update and rename stale.yml to stale-labeler.yml * Update add-new-comment-on-stale * Rename add-new-comment-on-stale to add-new-comment-on-stale.yml * Sabilmakbar Patch Workflow (#484) Bugfix on #482. * Update add-new-comment-on-stale.yml add workflow trigger criteria on PR message aswell * Update add-new-comment-on-stale.yml * Update add-new-comment-on-stale.yml fix yaml indent * Update add-new-comment-on-stale.yml * Closes #340 | Implement Dataloader for emotes_3k (#397) * Implement Dataloader for emotes_3k * Implement Dataloader for emotes_3k * Tasks updated from sentiment analysis to morality classification * Implement Change Request * formatting emotes_3k --------- Co-authored-by: Jonibek Mansurov <44943993+MJonibek@users.noreply.github.com> * refactor: remove main function Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * Update constants.py * Closes #311 | Add dataloader for indonesian_madurese_bible_translation (#337) * add dataloader for indonesian_madurese_bible_translation * update the license of indonesian_madurese_bible_translation * Update indonesian_madurese_bible_translation.py * modify based on comments from holylovenia * [indonesian_madurese_bible_translation] * update based on the reviewer's comments * Remove `CONTRIBUTING.md`, update PR Message Template, and add bash to initialize dataset (#468) * add bash to initialize dataset * delete CONTRIBUTING.md since it's duplicated with DATALOADER.md * update the docs slightly on suggesting new dataloader contributors to use template * fix few wordings * Add info on required vars '_LOCAL' * Add checklist on __init__.py * fix wording on 2nd checklist regarding 'my_dataset' that should've been a var instead of static val * fix wordings on first section of PR msg * add newline separator for better readability * add info on some to-dos * refactor: citation * Closes #83 | Implement Dataloader for GlobalWoZ (#261) * refactor by pre-commit * reformatted by pre-commit * refactor code for globalwoz * Create dataset loader for IndoQA #430 
(#431) * Add CODE_SWITCHING_IDENTIFICATION task (#488) * Closes #396 | Implement dataloader for CrossSum (#419) * Implement dataloader * Change to 3-letter ISO codes * Change task to CROSS_LINGUAL_SUMMARIZATION * Closes #92 | Create Jail break data loader (#390) * feat: jailbreak dataloader * fix: minor errors * refactor: styling * refactor: remove main entry * refactor: class name * refactor: remove unused loop * fix: separate text column into different subsets * Create __init__.py * Implement CommonVoice 12.0 dataloader (#452) * Closes #202 | Implement dataloader for WIT (#374) * Implement dataloader for WIT * Remove unnecessary commits * Add to description --------- Co-authored-by: Railey Montalan * Split into language subsets * Split into language subsets * Update seacrowd/sea_datasets/thai_depression/thai_depression.py Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * fix: change lincense to unknown * fix: minor errors * Closes #80 | Implement MSVD-Indonesian Dataloader (#135) * implement id_msvd dataloader * change logic for seacrowd schema (text first, then video); quality of life change to video schema * revert seacrowd video key from "text" to "texts" * change source logic to match original data implementation * run make check_file * Closes #34 | Create dataset loader for MKQA (#177) * Create dataset loader for MKQA #34 * Refactor class variables _LANGUAGES to global for MKQA #34 * Filter supported languages (SEA only) of seacrowd_qa schema for MKQA #34 * Filter supported languages (SEA only) of source schema for MKQA #34 * Filter supported languages (SEA only) for MKQA #34 (a leftover) * Change language code from macrolanguage, msa to zlm, for MKQA #34 * Change to a more appropriate language code of for Malaysian variant used in MKQA #34 * Changed the value of field 'type' of QA schema to be more general, and moved the more specific value to 'meta' field for MKQA #34 * Replace None value to empty array in 'answer_aliases' sub-field for consistency in MKQA #34 * Closes #193 | Create dataset loader for MALINDO Morph (#332) * Implement dataloader for MALINDO morph * Specify file encoding and remove newlines when loading data * Add blank __init__.py * Fix typos in docstring * Fix typos * Update seacrowd/sea_datasets/malindo_morph/malindo_morph.py Co-authored-by: Jennifer Santoso <62987563+jen-santoso@users.noreply.github.com> * Update seacrowd/sea_datasets/malindo_morph/malindo_morph.py Co-authored-by: Jennifer Santoso <62987563+jen-santoso@users.noreply.github.com> * Update seacrowd/sea_datasets/malindo_morph/malindo_morph.py --------- Co-authored-by: Jennifer Santoso <62987563+jen-santoso@users.noreply.github.com> * fix: subsets * Closes #314 | Add dataloader for Indonesia chinese mt robust eval (#388) * add dataloader for indonesian_madurese_bible_translation * update dataloader for indonesia_chinese_mtrobusteval * Delete seacrowd/sea_datasets/indonesian_madurese_bible_translation/indonesian_madurese_bible_translation.py * Update indonesia_chinese_mtrobusteval.py * update code based on the reviewer comments * add __init__.py * Update seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py * Update seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py --------- Co-authored-by: Jennifer Santoso <62987563+jen-santoso@users.noreply.github.com> * refactor: feature naming Co-authored-by: Salsabil Maulana Akbar * fix: homepage url * Closes #211 | Implement dataloader for SEAHORSE (#407) * 
implement seahorse dataloader * update * update * incorporate the latest comments though tensorflow still needed for tfds * update * update * fix: lowercase feature name * refactor: subset name * fix: limit the sentence paths to the relevant languages * refactor: remove possible error * Change default split to TEST * Closes #447 | Create dataset loader for Aya Dataset (#457) * Implementing data loader for Aya Dataset * Fixing license serialization issue * Update based on formatter for aya_dataset.py * update xlsum to extend more langs * update based on formatter * Closes #360 | Implement dataloader for khpos (#376) * Implement dataloader for khpos * Remove unneeded comment * Implemented Test and Validation loading * Streamlining code * Closes #116 | Add pho_ner_covid Dataloader (#461) * feat: pho_ner_covid dataloader * refactor: classname Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * fix: remove main function Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * refactor: remove inplace uses for dataframe * refactor: remove duplicate statement --------- Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * refactor: remove trailing spaces Co-authored-by: Salsabil Maulana Akbar * refactor: url format * edit 'texts' to 'text' key (#499) * Closes #217 | Implement dataloader for `wili_2018` (#381) * Implement dataloader for wili_2018 * update * Closes #104 | Add lazada_review_filipino (#409) * Add lazada_review_filipino Closes #104 * Update lazada_review_filipino.py Update config name * Update lazada_review_filipino.py fix typo * Update lazada_review_filipino.py bug fix - ValueError: Class label 5 greater than configured num_classes 5 * Update seacrowd/sea_datasets/lazada_review_filipino/lazada_review_filipino.py --------- Co-authored-by: Samuel Cahyawijaya Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * Adjust bash script test_example.sh and test_example_source_only.sh (#171) * update: adjust test_example.sh and test_example_source_only.sh * fix: minor error message when dataset is empty * updated kde4 language codes to iso639-3 * fix: citation * refactor: use base config class * create dataset loader for myanmar-rakhine parallel (#471) * add pyreadr==0.5.0 (#504) usage: reads/writes R RData and Rds files into/from pandas data frames * Closes #97 | Inter-Agency Task Force for the Management of Emerging Infectious Diseases (IATF) COVID-19 Resolutions (#460) * Closes #274 | Create OIL data loader (#389) * initial commit * refactor: move module * feat: dataset implementation * feat: oil dataloader * refactor: move dataloader file * refactor: move dataloader file * fix: non unique id error * refactor: file formating * refactor: remove comments * fix: invalid config name exception raise * refactor: audio cache file path * fix: remove useless loop * refactor: formatting * Create __init__.py * fix: citation * fix: remove seacrowd schema * Closes #49 | Updated existing TICO_19 dataloader to support more sea languages (#414) * Updated existing TICO_19 dataloader to support more sea languages * added sea languages to _LANGUAGES --------- Co-authored-by: ssfei81 * Closes #443 | Add dataloader for ASR-STIDUSC (#493) * Add dataloader for ASR-STIDUSC * update task, dataset name, pythonic coding * add relation extraction task (#502) * fix: subset and config name * Update bibtex id * Closes #356 | Implement dataloader for CodeSwitch-Reddit (#451) * Add CODE_SWITCHING_IDENTIFICATION task * 
Implement dataloader * Update codeswitch_reddit.py fix column naming in source (using lowercase instead of capitalized) * Closes #222 | Create dataset loader for CreoleRC (#469) * Create dataset loaderfor CreoleRC * remove changes to constants.py * remove document_id, add normalized, add sanity check on offset value * Update REVIEWING.md Clarify wording in Dataloader Reviewing Doc * Closes #341 | Create dataset loader for myParaphrase (#436) * [add] dataloader for my_paraphrase * [refactor] removed redundant breakpoint; put right default schema function * [refactor] changed schema for dataset * [refactor] split data into 3 categories(paraphrase, non_paraphrase, all) * [refactor] default config name is changed * [refactor] source configs for _paraphrase,_non_paraphrase,_all; altered schema naming * [refactor] cleaner conditioning, defined else clause * Closes #269 | Create dataset loader for ViVQA #269 (#318) * add vivqa dataloader * Update vivqa.py * update viviq dataloader config * Update vivqa.py * add vivqa dataloader * Update vivqa.py * update viviq dataloader config * Update vivqa.py * Update vivqa.py * update * Update vivqa.py * Update vivqa.py * Delete .idea/vcs.xml * Delete .idea/seacrowd-datahub.iml * Delete .idea/inspectionProfiles/profiles_settings.xml * Delete .idea/inspectionProfiles/Project_Default.xml * Update vivqa.py * Revert "Merge branch 'vivqa' of github.com:gyyz/seacrowd-datahub into vivqa" This reverts commit a96fa802359654d28891c653d24ce155073a6c65, reversing changes made to 23700ca77ff060915a19d5af09ca578fb41bd71f. * Delete .idea/vcs.xml * Delete .idea/seacrowd-datahub.iml * Delete .idea/inspectionProfiles/profiles_settings.xml * Delete .idea/inspectionProfiles/Project_Default.xml * Revert "Merge branch 'vivqa' of github.com:gyyz/seacrowd-datahub into vivqa" This reverts commit a96fa802359654d28891c653d24ce155073a6c65, reversing changes made to 23700ca77ff060915a19d5af09ca578fb41bd71f. * Revert "Revert "Merge branch 'vivqa' of github.com:gyyz/seacrowd-datahub into vivqa"" This reverts commit 5f1a3d69aaff1b0f5c2a1be6e26913ad706bcee2. 
* fixing trailing space and run Makefile * Closes #445 | Create dataset loader for malaysia-tweets-with-sentiment-labels (#450) * Fix typo syntax dictionary at constants.py * Add dataloader for malaysia_tweets * Completed requested changes * add dataloader for ASR-Sindodusc (#491) * Closes #475 | Add dataloader for indonglish-dataset (#490) * create dataloader for indonglish * make subset_id unique, use ClassLabel for label * Closes #215 | Implement dataloader for `thai_gpteacher` (#382) * Implement dataloader for thai_gpteacher * update * update * Closes #275 | Create dataset loader for UIT-ViCoV19QA #275 (#463) * add SeaCrowd dataloader for uit_vicov19qa * Merge subsets to one * remove unused imported package * Closes #309 | Create dataset loader for Vietnamese Hate Speech Detection (UIT-ViHSD) #309 Uit vihsd (#501) * create dataloader for uit_vihsd * Update uit_vihsd.py * Add some info for the labels * Update example for Seacrowd schema * Closes #441 | Add dataloader for ASR-SMALDUSC (#492) * Add dataloader for ASR-SMALDUSC * add prompt field * Closes #307 | Implement dataloader for ViSoBERT (#466) * Update constants.py * Implement dataloader for ViSoBERT * Fix conflicts with constants.py * Combine source and seacrowd_ssp schemas --------- Co-authored-by: Holy Lovenia Co-authored-by: Railey Montalan * add dataloader for wikitext_tl_39 (#486) * Closes #393 | Create dataset loader for WEATHub (#496) * [Feature] Add Weathub DataLoader * [Fix] Add filter for SEA languages only + add constants + run formatter * [Chore] Fix data loader naming * [Fix] Implement requested changes from review * Closes #188 | Implement dataloader for Sea-bench (#375) * Implement dataloader for WIT * Implement dataloader for sea_bench * Remove WIT * Remove logger and unnecessary variables * Add instruction tuning and remove QA and summarization tasks * Add __init__.py file * Remove machine translation task * Fix nitpicks --------- Co-authored-by: Railey Montalan * Closes #115 | Create dataset loader for PhoMT dataset (#489) * add dataloader for PhoMT dataset * Update seacrowd/sea_datasets/phomt/phomt.py Co-authored-by: Elyanah Aco * Update seacrowd/sea_datasets/phomt/phomt.py Co-authored-by: Elyanah Aco * Update seacrowd/sea_datasets/phomt/phomt.py Co-authored-by: Elyanah Aco * Update seacrowd/sea_datasets/phomt/phomt.py Co-authored-by: Elyanah Aco * Update seacrowd/sea_datasets/phomt/phomt.py Co-authored-by: Elyanah Aco * update text1/2 name for PhoMT dataset * Update phomt.py to replace en&vi to eng&vie --------- Co-authored-by: Elyanah Aco * Closes #310 | Create dataset loader for ViSpamReviews #310 (#454) * add vispamreviews dataloader * update vispamreviews * update schema * Closes #530 | Add/Update Dataloader Tatabahasa (#540) * feat: dataloader QA commonsense-reasoning * nitpick * Closes #267 | Add dataloader for struct_amb_ind (#506) * Implement dataloader for struct_amb_ind * Update seacrowd/sea_datasets/struct_amb_ind/struct_amb_ind.py Co-authored-by: Jonibek Mansurov <44943993+MJonibek@users.noreply.github.com> --------- Co-authored-by: Jonibek Mansurov <44943993+MJonibek@users.noreply.github.com> * Closes #347 | Create dataset loader for IndoWiki (#485) * create dataset loader for IndoWiki * remove seacrowd schema * Closes #354 | Implement dataloader for ETOS (#416) * Implement dataloader for ETOS * Implement dataloader for ETOS * Rename dataset class name to ETOSDataset * Remove schema due to insufficient annotations * Change ETOS into a POS tagging dataset * Add missing __init__.py file * Fix nitpicks * 
Add DEFAULT_CONFIG_NAME --------- Co-authored-by: Railey Montalan * update common_parser for UD JV_CSUI (#558) * Create dataset loader for UD Javanese-CSUI #427 (#432) * Closes #446 | Add/Update Dataloader voxlingua (#543) * add init voxlingua * Update seacrowd/sea_datasets/voxlingua/voxlingua.py Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> --------- Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * Closes #428 | Create dataset loader for Indonesia BioNER (#434) * Update seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py Co-authored-by: Jennifer Santoso <62987563+jen-santoso@users.noreply.github.com> * Update seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py Co-authored-by: Jennifer Santoso <62987563+jen-santoso@users.noreply.github.com> * Update seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py Co-authored-by: Jennifer Santoso <62987563+jen-santoso@users.noreply.github.com> * Update seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py Co-authored-by: Jennifer Santoso <62987563+jen-santoso@users.noreply.github.com> * Update cc3m_35l.py Changed "_LANGS" to "_LANGUAGES" * init commit * Update seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py Co-authored-by: Jennifer Santoso <62987563+jen-santoso@users.noreply.github.com> * Update seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py Co-authored-by: Jennifer Santoso <62987563+jen-santoso@users.noreply.github.com> * Update seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py Co-authored-by: Jennifer Santoso <62987563+jen-santoso@users.noreply.github.com> * Closes #344 | Create dataset loader for VLSP2016-SA (#500) * [add] dataloader for vlsp2016_sa[local] * [refactor] changed schema name --------- Co-authored-by: Amir Djanibekov * Fix the private datasheet link in POINTS.md (#568) * Closes #192 | Create dataset loader for MALINDO_parallel (#385) * add malindo_parallel.py * cleanup * Class name fix Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * Remove sample licenses Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * fix dataset formatting error, use original dataset id --------- Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> * Closes #114 | Implement dataloader for VnDT (#467) * Implement dataloader for VnDT * Add utility to impute missing sent_id and text fields from CoNLL files * Fix imputed outputs --------- Co-authored-by: Railey Montalan * add ocr task (#555) * PR for update subset composition of TydiQA | Close #465 (#503) * update csubset composition * Update Subset Composition * Update Subset Composition * update subset name indonesian --> ind thai --> tha * Update nusaparagraph_emot.py * Update nusaparagraph_emot.py * Update configs.py * Closes #346 | Implement dataloader for MUSE (Multilingual Unsupervised and Supervised Embeddings) (#406) * Implement dataloader for MUSE (Multilingual Unsupervised and Supervised Embeddings) * Create __init__.py for MUSE #346 * Remove unused comment lines for MUSE #346 * changed all 2 letters language codes to 3 letters --------- Co-authored-by: ssfei81 Co-authored-by: Frederikus Hudi * Closes #12 | Add/Update Dataloader BalitaNLP (#550) * Implement dataloader for balita_nlp * Remove articles with missing images from imtext schema * Add details to metadata * Adding New Citation for Bhinneka korpus (#599) * Add bhinnek_korpus dataset loader * Updating the suggested changes * Resolved review suggestions * adding new citation --------- Co-authored-by: Holy Lovenia * Closes #270 | Create dataset loader for OpenViVQA 
#270 (#464) * add sample * init submit for openvivqa dataloader * Update openvivqa.py * Update openvivqa.py * update dict format * Closes #516 | Add/Update Dataloader id_newspaper_2018 (#551) * Implement dataloader for id_newspaper_2018 * Specify JSON ecoding * Closes #429 | Implement dataloader for filipino_hatespeech_election (#487) * Add dataloader for filipino_hatespeech_election * update task * update * Closes #52 | Add cosem dataloader (#473) * feat: cosem dataloader * fix: citation * refactor: dataloader class name * fix: file parsing logic * fix: id format * fix: tab separator bug in text * fix: check for unique id * Closes #424 | Add Dataloader Bactrian-X * Import `schemas` beforehand on `templates/template.py` (#644) * add import statement for schemas * add import statement for schemas * Closes #313 | Add dataloader for Saltik (#387) * add dataloader for indonesian_madurese_bible_translation * add dataloader for saltik * Delete seacrowd/sea_datasets/indonesian_madurese_bible_translation/indonesian_madurese_bible_translation.py * update based on the reviewer comment * update based on the reviewer comment * Remove the modified constants.py from PR --------- Co-authored-by: Holy Lovenia * Add `.upper` method for `--schema` parameter (#648) * add upper method for --schema * revert code-style * Closes #438 | Add dataloader for ASR-INDOCSC (#509) * add dataloader for asr_indocsc * Update asr_indocsc.py for data downloading instructions --------- Co-authored-by: Salsabil Maulana Akbar Co-authored-by: Elyanah Aco Co-authored-by: Yuze GAO Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Co-authored-by: XU, Yan (Yana) Co-authored-by: Haochen Li Co-authored-by: Jennifer Santoso <62987563+jen-santoso@users.noreply.github.com> Co-authored-by: Holy Lovenia Co-authored-by: Lucky Susanto Co-authored-by: Samuel Cahyawijaya Co-authored-by: Muhammad Dehan Al Kautsar <68471412+dehanalkautsar@users.noreply.github.com> Co-authored-by: Lj Miranda Co-authored-by: Lucky Susanto <78904597+luckysusanto@users.noreply.github.com> Co-authored-by: Maria Khelli <71018093+khelli07@users.noreply.github.com> Co-authored-by: Ishan Jindal Co-authored-by: ssfei81 Co-authored-by: IvanHalimP Co-authored-by: Enliven26 <16521443@mahasiswa.itb.ac.id> Co-authored-by: Dan John Velasco <54620209+danjohnvelasco@users.noreply.github.com> Co-authored-by: Chenxi Co-authored-by: Bhavish Pahwa <53102161+bp-high@users.noreply.github.com> Co-authored-by: FawwazMayda <33770567+FawwazMayda@users.noreply.github.com> Co-authored-by: fawwaz.mayda Co-authored-by: Ilham F Putra <31740013+ilhamfp@users.noreply.github.com> Co-authored-by: rafif-kewmann Co-authored-by: mrafifrbbn Co-authored-by: Yong Zheng-Xin Co-authored-by: Amir Djanibekov <45315801+djanibekov@users.noreply.github.com> Co-authored-by: Amir Djanibekov Co-authored-by: joan <68073738+joanitolopo@users.noreply.github.com> Co-authored-by: joanitolopo Co-authored-by: Railey Montalan Co-authored-by: Railey Montalan Co-authored-by: ssun32 <31747965+ssun32@users.noreply.github.com> Co-authored-by: Tyson <784250886@qq.com> Co-authored-by: Ilham Firdausi Putra Co-authored-by: Johanes Lee <89065724+Enliven26@users.noreply.github.com> Co-authored-by: Akhdan Fadhilah Co-authored-by: Frederikus Hudi Co-authored-by: Börje Karlsson Co-authored-by: Muhammad Satrio Wicaksono Co-authored-by: Wenyu Zhang <7636719+zwenyu@users.noreply.github.com> Co-authored-by: R. 
Damanhuri <96944447+R-Damanhuri@users.noreply.github.com> Co-authored-by: Patrick Amadeus Irawan Co-authored-by: Reza Qorib Co-authored-by: Bryan Wilie Co-authored-by: Muhammad Ravi Shulthan Habibi --- .github/CODEOWNERS | 2 +- .github/PULL_REQUEST_TEMPLATE.md | 14 +- .../workflows/add-new-comment-on-stale.yml | 43 ++ .../{stale.yml => stale-labeler.yml} | 14 +- CONTRIBUTING.md | 208 ------- DATALOADER.md | 18 +- POINTS.md | 4 +- README.md | 7 +- REVIEWING.md | 48 ++ requirements.txt | 5 + .../{tydiqa_id => abui_wordnet}/__init__.py | 0 .../sea_datasets/abui_wordnet/abui_wordnet.py | 146 +++++ .../alt_burmese_treebank/__init__.py | 0 .../alt_burmese_treebank.py | 151 +++++ .../alt_burmese_treebank/utils/__init__.py | 0 .../utils/alt_burmese_treebank_utils.py | 70 +++ seacrowd/sea_datasets/ara_close/__init__.py | 0 seacrowd/sea_datasets/ara_close/ara_close.py | 194 ++++++ seacrowd/sea_datasets/asr_indocsc/__init__.py | 0 .../sea_datasets/asr_indocsc/asr_indocsc.py | 192 ++++++ .../sea_datasets/asr_sindodusc/__init__.py | 0 .../asr_sindodusc/asr_sindodusc.py | 180 ++++++ .../sea_datasets/asr_smaldusc/__init__.py | 0 .../sea_datasets/asr_smaldusc/asr_smaldusc.py | 182 ++++++ seacrowd/sea_datasets/asr_stidusc/__init__.py | 0 .../sea_datasets/asr_stidusc/asr_stidusc.py | 178 ++++++ .../audio_keyword_spotting/__init__.py | 0 .../audio_keyword_spotting.py | 198 ++++++ seacrowd/sea_datasets/aya_dataset/__init__.py | 0 .../sea_datasets/aya_dataset/aya_dataset.py | 188 ++++++ seacrowd/sea_datasets/bactrian_x/__init__.py | 0 .../sea_datasets/bactrian_x/bactrian_x.py | 153 +++++ seacrowd/sea_datasets/balita_nlp/__init__.py | 0 .../sea_datasets/balita_nlp/balita_nlp.py | 229 +++++++ .../sea_datasets/beaye_lexicon/__init__.py | 0 .../beaye_lexicon/beaye_lexicon.py | 116 ++++ seacrowd/sea_datasets/belebele/__init__.py | 0 .../sea_datasets/bhinneka_korpus/__init__.py | 0 .../bhinneka_korpus/bhinneka_korpus.py | 139 +++++ seacrowd/sea_datasets/bioner_id/__init__.py | 0 seacrowd/sea_datasets/bioner_id/bioner_id.py | 166 +++++ .../sea_datasets/bloom_captioning/__init__.py | 0 .../bloom_captioning/bloom_captioning.py | 248 ++++++++ seacrowd/sea_datasets/bloom_lm/__init__.py | 0 seacrowd/sea_datasets/bloom_lm/bloom_lm.py | 247 ++++++++ .../sea_datasets/bloom_speech/__init__.py | 0 .../sea_datasets/bloom_speech/bloom_speech.py | 172 ++++++ seacrowd/sea_datasets/bloom_vist/__init__.py | 0 .../sea_datasets/bloom_vist/bloom_vist.py | 262 ++++++++ seacrowd/sea_datasets/burapha_th/__init__.py | 0 .../sea_datasets/burapha_th/burapha_th.py | 167 +++++ seacrowd/sea_datasets/cc100/cc100.py | 67 +- seacrowd/sea_datasets/cc3m_35l/__init__.py | 0 seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py | 242 ++++++++ .../sea_datasets/cc_aligned_doc/__init__.py | 0 .../cc_aligned_doc/cc_aligned_doc.py | 154 +++++ .../sea_datasets/cc_aligned_sent/__init__.py | 0 .../cc_aligned_sent/cc_aligned_sent.py | 167 +++++ seacrowd/sea_datasets/coco_35l/__init__.py | 0 seacrowd/sea_datasets/coco_35l/coco_35l.py | 228 +++++++ .../codeswitch_reddit/__init__.py | 0 .../codeswitch_reddit/codeswitch_reddit.py | 209 +++++++ .../sea_datasets/commonvoice_120/__init__.py | 0 .../commonvoice_120/commonvoice_120.py | 208 +++++++ seacrowd/sea_datasets/cosem/__init__.py | 0 seacrowd/sea_datasets/cosem/cosem.py | 173 ++++++ seacrowd/sea_datasets/creole_rc/__init__.py | 0 seacrowd/sea_datasets/creole_rc/creole_rc.py | 207 +++++++ seacrowd/sea_datasets/crosssum/__init__.py | 0 seacrowd/sea_datasets/crosssum/crosssum.py | 140 +++++ 
.../sea_datasets/cub_bahasa/cub_bahasa.py | 3 + seacrowd/sea_datasets/culturax/culturax.py | 4 + .../sea_datasets/dengue_filipino/__init__.py | 0 .../dengue_filipino/dengue_filipino.py | 134 ++++ seacrowd/sea_datasets/emotes_3k/__init__.py | 0 seacrowd/sea_datasets/emotes_3k/emotes_3k.py | 238 +++++++ seacrowd/sea_datasets/etos/__init__.py | 0 seacrowd/sea_datasets/etos/etos.py | 205 +++++++ .../filipino_gay_lang/__init__.py | 0 .../filipino_gay_lang/filipino_gay_lang.py | 115 ++++ .../filipino_hatespeech_election/__init__.py | 0 .../filipino_hatespeech_election.py | 124 ++++ seacrowd/sea_datasets/filwordnet/__init__.py | 0 .../sea_datasets/filwordnet/filwordnet.py | 146 +++++ seacrowd/sea_datasets/flores200/__init__.py | 0 seacrowd/sea_datasets/flores200/flores200.py | 475 ++++++++++++++ seacrowd/sea_datasets/fsl_105/__init__.py | 0 seacrowd/sea_datasets/fsl_105/fsl_105.py | 191 ++++++ seacrowd/sea_datasets/gatitos/__init__.py | 0 seacrowd/sea_datasets/gatitos/gatitos.py | 140 +++++ .../sea_datasets/gklmip_newsclass/__init__.py | 0 .../gklmip_newsclass/gklmip_newsclass.py | 171 ++++++ .../sea_datasets/gklmip_sentiment/__init__.py | 0 .../gklmip_sentiment/gklmip_sentiment.py | 144 +++++ seacrowd/sea_datasets/globalwoz/__init__.py | 0 seacrowd/sea_datasets/globalwoz/globalwoz.py | 226 +++++++ .../glotstorybook/glotstorybook.py | 7 +- seacrowd/sea_datasets/gnome/__init__.py | 0 seacrowd/sea_datasets/gnome/gnome.py | 190 ++++++ seacrowd/sea_datasets/iapp_squad/__init__.py | 0 .../sea_datasets/iapp_squad/iapp_squad.py | 128 ++++ seacrowd/sea_datasets/iatf/__init__.py | 0 seacrowd/sea_datasets/iatf/iatf.py | 183 ++++++ seacrowd/sea_datasets/icon/__init__.py | 0 seacrowd/sea_datasets/icon/icon.py | 216 +++++++ .../id_coreference_resolution/__init__.py | 0 seacrowd/sea_datasets/id_msvd/__init__.py | 0 seacrowd/sea_datasets/id_msvd/id_msvd.py | 134 ++++ .../id_newspaper_2018/__init__.py | 0 .../id_newspaper_2018/id_newspaper_2018.py | 150 +++++ .../id_sent_emo_mobile_apps/__init__.py | 0 .../id_sent_emo_mobile_apps.py | 136 ++++ .../id_sentiment_analysis/__init__.py | 0 .../id_sentiment_analysis.py | 162 +++++ .../id_vaccines_tweets/__init__.py | 0 .../id_vaccines_tweets/id_vaccines_tweets.py | 113 ++++ seacrowd/sea_datasets/id_wsd/__init__.py | 0 seacrowd/sea_datasets/id_wsd/id_wsd.py | 10 +- .../identifikasi_bahasa/__init__.py | 0 .../identifikasi_bahasa.py | 136 ++++ .../sea_datasets/ind_proner/ind_proner.py | 191 ++++++ .../sea_datasets/indo_story_cloze/__init__.py | 0 .../indo_story_cloze/indo_story_cloze.py | 179 ++++++ seacrowd/sea_datasets/indocamrest/__init__.py | 0 .../sea_datasets/indocamrest/indocamrest.py | 163 +++++ seacrowd/sea_datasets/indoler/__init__.py | 0 seacrowd/sea_datasets/indoler/indoler.py | 214 +++++++ seacrowd/sea_datasets/indommlu/__init__.py | 0 seacrowd/sea_datasets/indommlu/indommlu.py | 291 +++++++++ .../sea_datasets/indoner_tourism/__init__.py | 0 .../indoner_tourism/indoner_tourism.py | 183 ++++++ .../__init__.py | 0 .../indonesia_chinese_mtrobusteval.py | 151 +++++ .../__init__.py | 0 .../indonesian_madurese_bible_translation.py | 180 ++++++ .../indonesian_news_dataset/__init__.py | 0 .../indonesian_news_dataset.py | 128 ++++ .../sea_datasets/indonesiannmt/__init__.py | 0 .../indonesiannmt/indonesiannmt.py | 216 +++++++ seacrowd/sea_datasets/indonglish/__init__.py | 0 .../sea_datasets/indonglish/indonglish.py | 216 +++++++ seacrowd/sea_datasets/indoqa/__init__.py | 0 seacrowd/sea_datasets/indoqa/indoqa.py | 152 +++++ seacrowd/sea_datasets/indosmd/__init__.py | 0 
seacrowd/sea_datasets/indosmd/indosmd.py | 273 +++++++++ seacrowd/sea_datasets/indowiki/__init__.py | 0 seacrowd/sea_datasets/indowiki/indowiki.py | 198 ++++++ seacrowd/sea_datasets/kawat/__init__.py | 0 seacrowd/sea_datasets/kde4/__init__.py | 0 seacrowd/sea_datasets/kde4/kde4.py | 574 +++++++++++++++++ seacrowd/sea_datasets/kheng_info/__init__.py | 0 .../sea_datasets/kheng_info/kheng_info.py | 113 ++++ seacrowd/sea_datasets/khpos/__init__.py | 0 seacrowd/sea_datasets/khpos/khpos.py | 212 +++++++ .../lazada_review_filipino/__init__.py | 0 .../lazada_review_filipino.py | 147 +++++ seacrowd/sea_datasets/lr_sum/__init__.py | 0 seacrowd/sea_datasets/lr_sum/lr_sum.py | 166 +++++ seacrowd/sea_datasets/m3exam/__init__.py | 0 seacrowd/sea_datasets/m3exam/m3exam.py | 308 ++++++++++ .../sea_datasets/malaysia_tweets/__init__.py | 0 .../malaysia_tweets/malaysia_tweets.py | 152 +++++ .../sea_datasets/malindo_morph/__init__.py | 0 .../malindo_morph/malindo_morph.py | 124 ++++ .../sea_datasets/malindo_parallel/__init__.py | 0 .../malindo_parallel/malindo_parallel.py | 196 ++++++ seacrowd/sea_datasets/massive/__init__.py | 0 seacrowd/sea_datasets/massive/massive.py | 580 ++++++++++++++++++ .../sea_datasets/melayu_brunei/__init__.py | 0 .../melayu_brunei/melayu_brunei.py | 197 ++++++ .../sea_datasets/melayu_sabah/__init__.py | 0 .../sea_datasets/melayu_sabah/melayu_sabah.py | 148 +++++ .../sea_datasets/melayu_sarawak/__init__.py | 0 .../melayu_sarawak/melayu_sarawak.py | 146 +++++ .../melayu_standard_lisan/__init__.py | 0 .../melayu_standard_lisan.py | 162 +++++ seacrowd/sea_datasets/memolon/__init__.py | 0 seacrowd/sea_datasets/memolon/memolon.py | 142 +++++ seacrowd/sea_datasets/miracl/__init__.py | 0 seacrowd/sea_datasets/miracl/miracl.py | 288 +++++++++ seacrowd/sea_datasets/mkqa/__init__.py | 0 seacrowd/sea_datasets/mkqa/mkqa.py | 227 +++++++ seacrowd/sea_datasets/mlqa/mlqa.py | 1 + .../sea_datasets/mozilla_pontoon/__init__.py | 0 .../mozilla_pontoon/mozilla_pontoon.py | 171 ++++++ seacrowd/sea_datasets/mswc/__init__.py | 0 seacrowd/sea_datasets/mswc/mswc.py | 219 +++++++ .../mtop_intent_classification/__init__.py | 0 .../mtop_intent_classification/labels.py | 126 ++++ .../mtop_intent_classification.py | 135 ++++ seacrowd/sea_datasets/muse/__init__.py | 0 seacrowd/sea_datasets/muse/muse.py | 197 ++++++ .../sea_datasets/my_paraphrase/__init__.py | 0 .../my_paraphrase/my_paraphrase.py | 200 ++++++ .../myanmar_rakhine_parallel/__init__.py | 0 .../myanmar_rakhine_parallel.py | 179 ++++++ seacrowd/sea_datasets/mysentence/__init__.py | 0 .../sea_datasets/mysentence/mysentence.py | 170 +++++ seacrowd/sea_datasets/myxnli/__init__.py | 0 seacrowd/sea_datasets/myxnli/myxnli.py | 143 +++++ seacrowd/sea_datasets/newsph/__init__.py | 0 seacrowd/sea_datasets/newsph/newsph.py | 109 ++++ seacrowd/sea_datasets/ntrex_128/__init__.py | 0 seacrowd/sea_datasets/ntrex_128/ntrex_128.py | 444 ++++++++++++++ .../nusaparagraph_emot/nusaparagraph_emot.py | 2 +- seacrowd/sea_datasets/oil/__init__.py | 0 seacrowd/sea_datasets/oil/oil.py | 149 +++++ seacrowd/sea_datasets/openlid/__init__.py | 0 seacrowd/sea_datasets/openlid/openlid.py | 140 +++++ seacrowd/sea_datasets/openslr/__init__.py | 0 seacrowd/sea_datasets/openslr/openslr.py | 258 ++++++++ seacrowd/sea_datasets/openvivqa/__init__.py | 0 seacrowd/sea_datasets/openvivqa/openvivqa.py | 162 +++++ seacrowd/sea_datasets/orchid_pos/__init__.py | 0 .../sea_datasets/orchid_pos/orchid_pos.py | 272 ++++++++ .../sea_datasets/oscar_2201/oscar_2201.py | 3 + 
seacrowd/sea_datasets/palito/__init__.py | 0 seacrowd/sea_datasets/palito/palito.py | 160 +++++ .../ph_fake_news_corpus/__init__.py | 0 .../ph_fake_news_corpus.py | 115 ++++ .../sea_datasets/pho_ner_covid/__init__.py | 0 .../pho_ner_covid/pho_ner_covid.py | 203 ++++++ .../sea_datasets/phoatis/intent_label.txt | 29 + seacrowd/sea_datasets/phoatis/phoatis.py | 239 ++++++++ seacrowd/sea_datasets/phoatis/slot_label.txt | 150 +++++ seacrowd/sea_datasets/phomt/__init__.py | 0 seacrowd/sea_datasets/phomt/phomt.py | 139 +++++ seacrowd/sea_datasets/prdect_id/__init__.py | 0 seacrowd/sea_datasets/prdect_id/prdect_id.py | 161 +++++ seacrowd/sea_datasets/qasina/__init__.py | 0 seacrowd/sea_datasets/qasina/qasina.py | 173 ++++++ .../sea_datasets/roots_vi_ted/__init__.py | 0 .../sea_datasets/roots_vi_ted/roots_vi_ted.py | 128 ++++ seacrowd/sea_datasets/saltik/__init__.py | 0 seacrowd/sea_datasets/saltik/saltik.py | 133 ++++ seacrowd/sea_datasets/sampiran/__init__.py | 0 seacrowd/sea_datasets/sap_wat/__init__.py | 0 seacrowd/sea_datasets/sap_wat/sap_wat.py | 175 ++++++ .../sea_datasets/sarawak_malay/__init__.py | 0 .../sarawak_malay/sarawak_malay.py | 178 ++++++ .../sea_datasets/scb_mt_en_th/__init__.py | 0 .../sea_datasets/scb_mt_en_th/scb_mt_en_th.py | 165 +++++ seacrowd/sea_datasets/sea_bench/__init__.py | 0 seacrowd/sea_datasets/sea_bench/sea_bench.py | 193 ++++++ seacrowd/sea_datasets/seaeval/__init__.py | 0 seacrowd/sea_datasets/seaeval/seaeval.py | 238 +++++++ seacrowd/sea_datasets/seahorse/__init__.py | 0 seacrowd/sea_datasets/seahorse/seahorse.py | 194 ++++++ seacrowd/sea_datasets/snli_indo/__init__.py | 0 seacrowd/sea_datasets/snli_indo/snli_indo.py | 158 +++++ seacrowd/sea_datasets/spamid_pair/__init__.py | 0 .../sea_datasets/spamid_pair/spamid_pair.py | 160 +++++ seacrowd/sea_datasets/stb_ext/__init__.py | 0 seacrowd/sea_datasets/stb_ext/stb_ext.py | 195 ++++++ .../sea_datasets/struct_amb_ind/__init__.py | 0 .../struct_amb_ind/struct_amb_ind.py | 174 ++++++ seacrowd/sea_datasets/tatabahasa/__init__.py | 0 .../sea_datasets/tatabahasa/tatabahasa.py | 156 +++++ seacrowd/sea_datasets/tcope/__init__.py | 0 seacrowd/sea_datasets/tcope/tcope.py | 163 +++++ .../sea_datasets/tgl_profanity/__init__.py | 0 .../tgl_profanity/tgl_profanity.py | 115 ++++ .../tha_lao_embassy_parcor/__init__.py | 0 .../tha_lao_embassy_parcor.py | 126 ++++ seacrowd/sea_datasets/thai_alpaca/__init__.py | 0 .../sea_datasets/thai_alpaca/thai_alpaca.py | 108 ++++ .../thai_constitution/__init__.py | 0 .../thai_constitution/thai_constitution.py | 144 +++++ .../thai_databricks_dolly/__init__.py | 0 .../thai_databricks_dolly.py | 114 ++++ .../sea_datasets/thai_depression/__init__.py | 0 .../thai_depression/thai_depression.py | 145 +++++ .../sea_datasets/thai_gpteacher/__init__.py | 0 .../thai_gpteacher/thai_gpteacher.py | 118 ++++ .../sea_datasets/thai_hh_rlhf/__init__.py | 0 .../sea_datasets/thai_hh_rlhf/thai_hh_rlhf.py | 122 ++++ seacrowd/sea_datasets/thai_sum/__init__.py | 0 seacrowd/sea_datasets/thai_sum/thai_sum.py | 144 +++++ .../thai_toxicity_tweet/__init__.py | 0 .../thai_toxicity_tweet.py | 120 ++++ seacrowd/sea_datasets/thaigov/thaigov.py | 196 ++++++ seacrowd/sea_datasets/tico_19/tico_19.py | 128 ++-- .../tmad_malay_corpus/tmad_malay_corpus.py | 140 +++++ seacrowd/sea_datasets/tydiqa/__init__.py | 0 seacrowd/sea_datasets/tydiqa/tydiqa.py | 436 +++++++++++++ seacrowd/sea_datasets/tydiqa_id/tydiqa_id.py | 187 ------ .../typhoon_yolanda_tweets.py | 3 + .../sea_datasets/ucla_phonetic/__init__.py | 0 
.../ucla_phonetic/ucla_phonetic.py | 158 +++++ seacrowd/sea_datasets/ud_jv_csui/__init__.py | 0 .../sea_datasets/ud_jv_csui/ud_jv_csui.py | 256 ++++++++ seacrowd/sea_datasets/udhr/udhr.py | 3 + .../sea_datasets/uit_vicov19qa/__init__.py | 0 .../uit_vicov19qa/uit_vicov19qa.py | 167 +++++ seacrowd/sea_datasets/uit_victsd/__init__.py | 0 .../sea_datasets/uit_victsd/uit_victsd.py | 132 ++++ seacrowd/sea_datasets/uit_vihsd/__init__.py | 0 seacrowd/sea_datasets/uit_vihsd/uit_vihsd.py | 145 +++++ seacrowd/sea_datasets/uit_viic/__init__.py | 0 seacrowd/sea_datasets/uit_viic/uit_viic.py | 150 +++++ seacrowd/sea_datasets/uit_viocd/__init__.py | 0 seacrowd/sea_datasets/uit_viocd/uit_viocd.py | 141 +++++ seacrowd/sea_datasets/uit_vion/__init__.py | 0 seacrowd/sea_datasets/uit_vion/uit_vion.py | 170 +++++ seacrowd/sea_datasets/uit_vsfc/__init__.py | 0 seacrowd/sea_datasets/uit_vsfc/uit_vsfc.py | 203 ++++++ seacrowd/sea_datasets/uit_vsmec/__init__.py | 0 seacrowd/sea_datasets/uit_vsmec/uit_vsmec.py | 130 ++++ seacrowd/sea_datasets/unimorph/__init__.py | 0 seacrowd/sea_datasets/unimorph/unimorph.py | 447 ++++++++++++++ seacrowd/sea_datasets/vi_pubmed/__init__.py | 0 seacrowd/sea_datasets/vi_pubmed/vi_pubmed.py | 260 ++++++++ seacrowd/sea_datasets/vihealthqa/__init__.py | 0 .../sea_datasets/vihealthqa/vihealthqa.py | 157 +++++ seacrowd/sea_datasets/visobert/__init__.py | 0 seacrowd/sea_datasets/visobert/visobert.py | 158 +++++ .../sea_datasets/vispamreviews/__init__.py | 0 .../vispamreviews/vispamreviews.py | 179 ++++++ .../sea_datasets/vistec_tp_th_21/__init__.py | 0 .../vistec_tp_th_21/vistec_tp_th_21.py | 183 ++++++ .../sea_datasets/vitext2sql/vitext2sql.py | 2 + seacrowd/sea_datasets/vivos/__init__.py | 0 seacrowd/sea_datasets/vivos/vivos.py | 204 ++++++ seacrowd/sea_datasets/vivqa/__init__.py | 0 seacrowd/sea_datasets/vivqa/vivqa.py | 218 +++++++ .../sea_datasets/vlsp2016_ner/__init__.py | 0 .../sea_datasets/vlsp2016_ner/vlsp2016_ner.py | 164 +++++ seacrowd/sea_datasets/vlsp2016_sa/__init__.py | 0 .../sea_datasets/vlsp2016_sa/vlsp2016_sa.py | 181 ++++++ seacrowd/sea_datasets/vndt/__init__.py | 0 seacrowd/sea_datasets/vndt/utils.py | 61 ++ seacrowd/sea_datasets/vndt/vndt.py | 197 ++++++ seacrowd/sea_datasets/voxlingua/__init__.py | 0 seacrowd/sea_datasets/voxlingua/voxlingua.py | 204 ++++++ seacrowd/sea_datasets/weathub/__init__.py | 0 seacrowd/sea_datasets/weathub/weathub.py | 145 +++++ seacrowd/sea_datasets/wikimatrix/__init__.py | 0 .../sea_datasets/wikimatrix/wikimatrix.py | 277 +++++++++ .../sea_datasets/wikitext_tl_39/__init__.py | 0 .../wikitext_tl_39/wikitext_tl_39.py | 111 ++++ seacrowd/sea_datasets/wili_2018/__init__.py | 0 seacrowd/sea_datasets/wili_2018/wili_2018.py | 359 +++++++++++ .../wisesight_thai_sentiment/__init__.py | 0 .../wisesight_thai_sentiment.py | 177 ++++++ seacrowd/sea_datasets/wit/__init__.py | 0 seacrowd/sea_datasets/wit/wit.py | 274 +++++++++ .../sea_datasets/wongnai_reviews/__init__.py | 0 .../wongnai_reviews/wongnai_reviews.py | 116 ++++ seacrowd/sea_datasets/xcopa/xcopa.py | 84 +-- .../sea_datasets/xl_jailbreak/__init__.py | 0 .../sea_datasets/xl_jailbreak/xl_jailbreak.py | 195 ++++++ seacrowd/sea_datasets/xl_sum/xl_sum.py | 160 +++-- seacrowd/sea_datasets/xm3600/xm3600.py | 8 +- seacrowd/sea_datasets/xnli/__init__.py | 0 seacrowd/sea_datasets/xnli/xnli.py | 214 +++++++ seacrowd/sea_datasets/xquad/xquad.py | 3 + seacrowd/sea_datasets/xstorycloze/__init__.py | 0 .../sea_datasets/xstorycloze/xstorycloze.py | 176 ++++++ .../yunshan_cup_2020/yunshan_cup_2020.py | 3 + 
seacrowd/utils/common_parser.py | 26 +- seacrowd/utils/configs.py | 2 +- seacrowd/utils/constants.py | 109 ++-- seacrowd/utils/schemas/__init__.py | 6 + seacrowd/utils/schemas/imqa.py | 27 + seacrowd/utils/schemas/tod.py | 74 +++ seacrowd/utils/schemas/tree.py | 115 ++++ seacrowd/utils/schemas/video.py | 2 +- templates/initiate_seacrowd_dataloader.sh | 20 + templates/template.py | 14 +- test_example.sh | 2 +- test_example_source_only.sh | 2 +- tests/test_seacrowd.py | 2 +- 372 files changed, 31731 insertions(+), 681 deletions(-) create mode 100644 .github/workflows/add-new-comment-on-stale.yml rename .github/workflows/{stale.yml => stale-labeler.yml} (67%) delete mode 100644 CONTRIBUTING.md rename seacrowd/sea_datasets/{tydiqa_id => abui_wordnet}/__init__.py (100%) create mode 100644 seacrowd/sea_datasets/abui_wordnet/abui_wordnet.py create mode 100644 seacrowd/sea_datasets/alt_burmese_treebank/__init__.py create mode 100644 seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py create mode 100644 seacrowd/sea_datasets/alt_burmese_treebank/utils/__init__.py create mode 100644 seacrowd/sea_datasets/alt_burmese_treebank/utils/alt_burmese_treebank_utils.py create mode 100644 seacrowd/sea_datasets/ara_close/__init__.py create mode 100644 seacrowd/sea_datasets/ara_close/ara_close.py create mode 100644 seacrowd/sea_datasets/asr_indocsc/__init__.py create mode 100644 seacrowd/sea_datasets/asr_indocsc/asr_indocsc.py create mode 100644 seacrowd/sea_datasets/asr_sindodusc/__init__.py create mode 100644 seacrowd/sea_datasets/asr_sindodusc/asr_sindodusc.py create mode 100644 seacrowd/sea_datasets/asr_smaldusc/__init__.py create mode 100644 seacrowd/sea_datasets/asr_smaldusc/asr_smaldusc.py create mode 100644 seacrowd/sea_datasets/asr_stidusc/__init__.py create mode 100644 seacrowd/sea_datasets/asr_stidusc/asr_stidusc.py create mode 100644 seacrowd/sea_datasets/audio_keyword_spotting/__init__.py create mode 100644 seacrowd/sea_datasets/audio_keyword_spotting/audio_keyword_spotting.py create mode 100644 seacrowd/sea_datasets/aya_dataset/__init__.py create mode 100644 seacrowd/sea_datasets/aya_dataset/aya_dataset.py create mode 100644 seacrowd/sea_datasets/bactrian_x/__init__.py create mode 100644 seacrowd/sea_datasets/bactrian_x/bactrian_x.py create mode 100644 seacrowd/sea_datasets/balita_nlp/__init__.py create mode 100644 seacrowd/sea_datasets/balita_nlp/balita_nlp.py create mode 100644 seacrowd/sea_datasets/beaye_lexicon/__init__.py create mode 100644 seacrowd/sea_datasets/beaye_lexicon/beaye_lexicon.py create mode 100644 seacrowd/sea_datasets/belebele/__init__.py create mode 100644 seacrowd/sea_datasets/bhinneka_korpus/__init__.py create mode 100644 seacrowd/sea_datasets/bhinneka_korpus/bhinneka_korpus.py create mode 100644 seacrowd/sea_datasets/bioner_id/__init__.py create mode 100644 seacrowd/sea_datasets/bioner_id/bioner_id.py create mode 100644 seacrowd/sea_datasets/bloom_captioning/__init__.py create mode 100644 seacrowd/sea_datasets/bloom_captioning/bloom_captioning.py create mode 100644 seacrowd/sea_datasets/bloom_lm/__init__.py create mode 100644 seacrowd/sea_datasets/bloom_lm/bloom_lm.py create mode 100644 seacrowd/sea_datasets/bloom_speech/__init__.py create mode 100644 seacrowd/sea_datasets/bloom_speech/bloom_speech.py create mode 100644 seacrowd/sea_datasets/bloom_vist/__init__.py create mode 100644 seacrowd/sea_datasets/bloom_vist/bloom_vist.py create mode 100644 seacrowd/sea_datasets/burapha_th/__init__.py create mode 100644 seacrowd/sea_datasets/burapha_th/burapha_th.py 
create mode 100644 seacrowd/sea_datasets/cc3m_35l/__init__.py create mode 100644 seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py create mode 100644 seacrowd/sea_datasets/cc_aligned_doc/__init__.py create mode 100644 seacrowd/sea_datasets/cc_aligned_doc/cc_aligned_doc.py create mode 100644 seacrowd/sea_datasets/cc_aligned_sent/__init__.py create mode 100644 seacrowd/sea_datasets/cc_aligned_sent/cc_aligned_sent.py create mode 100644 seacrowd/sea_datasets/coco_35l/__init__.py create mode 100644 seacrowd/sea_datasets/coco_35l/coco_35l.py create mode 100644 seacrowd/sea_datasets/codeswitch_reddit/__init__.py create mode 100644 seacrowd/sea_datasets/codeswitch_reddit/codeswitch_reddit.py create mode 100644 seacrowd/sea_datasets/commonvoice_120/__init__.py create mode 100644 seacrowd/sea_datasets/commonvoice_120/commonvoice_120.py create mode 100644 seacrowd/sea_datasets/cosem/__init__.py create mode 100644 seacrowd/sea_datasets/cosem/cosem.py create mode 100644 seacrowd/sea_datasets/creole_rc/__init__.py create mode 100644 seacrowd/sea_datasets/creole_rc/creole_rc.py create mode 100644 seacrowd/sea_datasets/crosssum/__init__.py create mode 100644 seacrowd/sea_datasets/crosssum/crosssum.py create mode 100644 seacrowd/sea_datasets/dengue_filipino/__init__.py create mode 100644 seacrowd/sea_datasets/dengue_filipino/dengue_filipino.py create mode 100644 seacrowd/sea_datasets/emotes_3k/__init__.py create mode 100644 seacrowd/sea_datasets/emotes_3k/emotes_3k.py create mode 100644 seacrowd/sea_datasets/etos/__init__.py create mode 100644 seacrowd/sea_datasets/etos/etos.py create mode 100644 seacrowd/sea_datasets/filipino_gay_lang/__init__.py create mode 100644 seacrowd/sea_datasets/filipino_gay_lang/filipino_gay_lang.py create mode 100644 seacrowd/sea_datasets/filipino_hatespeech_election/__init__.py create mode 100644 seacrowd/sea_datasets/filipino_hatespeech_election/filipino_hatespeech_election.py create mode 100644 seacrowd/sea_datasets/filwordnet/__init__.py create mode 100644 seacrowd/sea_datasets/filwordnet/filwordnet.py create mode 100644 seacrowd/sea_datasets/flores200/__init__.py create mode 100644 seacrowd/sea_datasets/flores200/flores200.py create mode 100644 seacrowd/sea_datasets/fsl_105/__init__.py create mode 100644 seacrowd/sea_datasets/fsl_105/fsl_105.py create mode 100644 seacrowd/sea_datasets/gatitos/__init__.py create mode 100644 seacrowd/sea_datasets/gatitos/gatitos.py create mode 100644 seacrowd/sea_datasets/gklmip_newsclass/__init__.py create mode 100644 seacrowd/sea_datasets/gklmip_newsclass/gklmip_newsclass.py create mode 100644 seacrowd/sea_datasets/gklmip_sentiment/__init__.py create mode 100644 seacrowd/sea_datasets/gklmip_sentiment/gklmip_sentiment.py create mode 100644 seacrowd/sea_datasets/globalwoz/__init__.py create mode 100644 seacrowd/sea_datasets/globalwoz/globalwoz.py create mode 100644 seacrowd/sea_datasets/gnome/__init__.py create mode 100644 seacrowd/sea_datasets/gnome/gnome.py create mode 100644 seacrowd/sea_datasets/iapp_squad/__init__.py create mode 100644 seacrowd/sea_datasets/iapp_squad/iapp_squad.py create mode 100644 seacrowd/sea_datasets/iatf/__init__.py create mode 100644 seacrowd/sea_datasets/iatf/iatf.py create mode 100644 seacrowd/sea_datasets/icon/__init__.py create mode 100644 seacrowd/sea_datasets/icon/icon.py create mode 100644 seacrowd/sea_datasets/id_coreference_resolution/__init__.py create mode 100644 seacrowd/sea_datasets/id_msvd/__init__.py create mode 100644 seacrowd/sea_datasets/id_msvd/id_msvd.py create mode 100644 
seacrowd/sea_datasets/id_newspaper_2018/__init__.py create mode 100644 seacrowd/sea_datasets/id_newspaper_2018/id_newspaper_2018.py create mode 100644 seacrowd/sea_datasets/id_sent_emo_mobile_apps/__init__.py create mode 100644 seacrowd/sea_datasets/id_sent_emo_mobile_apps/id_sent_emo_mobile_apps.py create mode 100644 seacrowd/sea_datasets/id_sentiment_analysis/__init__.py create mode 100644 seacrowd/sea_datasets/id_sentiment_analysis/id_sentiment_analysis.py create mode 100644 seacrowd/sea_datasets/id_vaccines_tweets/__init__.py create mode 100644 seacrowd/sea_datasets/id_vaccines_tweets/id_vaccines_tweets.py create mode 100644 seacrowd/sea_datasets/id_wsd/__init__.py create mode 100644 seacrowd/sea_datasets/identifikasi_bahasa/__init__.py create mode 100644 seacrowd/sea_datasets/identifikasi_bahasa/identifikasi_bahasa.py create mode 100644 seacrowd/sea_datasets/ind_proner/ind_proner.py create mode 100644 seacrowd/sea_datasets/indo_story_cloze/__init__.py create mode 100644 seacrowd/sea_datasets/indo_story_cloze/indo_story_cloze.py create mode 100644 seacrowd/sea_datasets/indocamrest/__init__.py create mode 100644 seacrowd/sea_datasets/indocamrest/indocamrest.py create mode 100644 seacrowd/sea_datasets/indoler/__init__.py create mode 100644 seacrowd/sea_datasets/indoler/indoler.py create mode 100644 seacrowd/sea_datasets/indommlu/__init__.py create mode 100644 seacrowd/sea_datasets/indommlu/indommlu.py create mode 100644 seacrowd/sea_datasets/indoner_tourism/__init__.py create mode 100644 seacrowd/sea_datasets/indoner_tourism/indoner_tourism.py create mode 100644 seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/__init__.py create mode 100644 seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py create mode 100644 seacrowd/sea_datasets/indonesian_madurese_bible_translation/__init__.py create mode 100644 seacrowd/sea_datasets/indonesian_madurese_bible_translation/indonesian_madurese_bible_translation.py create mode 100644 seacrowd/sea_datasets/indonesian_news_dataset/__init__.py create mode 100644 seacrowd/sea_datasets/indonesian_news_dataset/indonesian_news_dataset.py create mode 100644 seacrowd/sea_datasets/indonesiannmt/__init__.py create mode 100644 seacrowd/sea_datasets/indonesiannmt/indonesiannmt.py create mode 100644 seacrowd/sea_datasets/indonglish/__init__.py create mode 100644 seacrowd/sea_datasets/indonglish/indonglish.py create mode 100644 seacrowd/sea_datasets/indoqa/__init__.py create mode 100644 seacrowd/sea_datasets/indoqa/indoqa.py create mode 100644 seacrowd/sea_datasets/indosmd/__init__.py create mode 100644 seacrowd/sea_datasets/indosmd/indosmd.py create mode 100644 seacrowd/sea_datasets/indowiki/__init__.py create mode 100644 seacrowd/sea_datasets/indowiki/indowiki.py create mode 100644 seacrowd/sea_datasets/kawat/__init__.py create mode 100644 seacrowd/sea_datasets/kde4/__init__.py create mode 100644 seacrowd/sea_datasets/kde4/kde4.py create mode 100644 seacrowd/sea_datasets/kheng_info/__init__.py create mode 100644 seacrowd/sea_datasets/kheng_info/kheng_info.py create mode 100644 seacrowd/sea_datasets/khpos/__init__.py create mode 100644 seacrowd/sea_datasets/khpos/khpos.py create mode 100644 seacrowd/sea_datasets/lazada_review_filipino/__init__.py create mode 100644 seacrowd/sea_datasets/lazada_review_filipino/lazada_review_filipino.py create mode 100644 seacrowd/sea_datasets/lr_sum/__init__.py create mode 100644 seacrowd/sea_datasets/lr_sum/lr_sum.py create mode 100644 seacrowd/sea_datasets/m3exam/__init__.py create mode 100644 
seacrowd/sea_datasets/m3exam/m3exam.py create mode 100644 seacrowd/sea_datasets/malaysia_tweets/__init__.py create mode 100644 seacrowd/sea_datasets/malaysia_tweets/malaysia_tweets.py create mode 100644 seacrowd/sea_datasets/malindo_morph/__init__.py create mode 100644 seacrowd/sea_datasets/malindo_morph/malindo_morph.py create mode 100644 seacrowd/sea_datasets/malindo_parallel/__init__.py create mode 100644 seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py create mode 100644 seacrowd/sea_datasets/massive/__init__.py create mode 100644 seacrowd/sea_datasets/massive/massive.py create mode 100644 seacrowd/sea_datasets/melayu_brunei/__init__.py create mode 100644 seacrowd/sea_datasets/melayu_brunei/melayu_brunei.py create mode 100644 seacrowd/sea_datasets/melayu_sabah/__init__.py create mode 100644 seacrowd/sea_datasets/melayu_sabah/melayu_sabah.py create mode 100644 seacrowd/sea_datasets/melayu_sarawak/__init__.py create mode 100644 seacrowd/sea_datasets/melayu_sarawak/melayu_sarawak.py create mode 100644 seacrowd/sea_datasets/melayu_standard_lisan/__init__.py create mode 100644 seacrowd/sea_datasets/melayu_standard_lisan/melayu_standard_lisan.py create mode 100644 seacrowd/sea_datasets/memolon/__init__.py create mode 100644 seacrowd/sea_datasets/memolon/memolon.py create mode 100644 seacrowd/sea_datasets/miracl/__init__.py create mode 100644 seacrowd/sea_datasets/miracl/miracl.py create mode 100644 seacrowd/sea_datasets/mkqa/__init__.py create mode 100644 seacrowd/sea_datasets/mkqa/mkqa.py create mode 100644 seacrowd/sea_datasets/mozilla_pontoon/__init__.py create mode 100644 seacrowd/sea_datasets/mozilla_pontoon/mozilla_pontoon.py create mode 100644 seacrowd/sea_datasets/mswc/__init__.py create mode 100644 seacrowd/sea_datasets/mswc/mswc.py create mode 100644 seacrowd/sea_datasets/mtop_intent_classification/__init__.py create mode 100644 seacrowd/sea_datasets/mtop_intent_classification/labels.py create mode 100644 seacrowd/sea_datasets/mtop_intent_classification/mtop_intent_classification.py create mode 100644 seacrowd/sea_datasets/muse/__init__.py create mode 100644 seacrowd/sea_datasets/muse/muse.py create mode 100644 seacrowd/sea_datasets/my_paraphrase/__init__.py create mode 100644 seacrowd/sea_datasets/my_paraphrase/my_paraphrase.py create mode 100644 seacrowd/sea_datasets/myanmar_rakhine_parallel/__init__.py create mode 100644 seacrowd/sea_datasets/myanmar_rakhine_parallel/myanmar_rakhine_parallel.py create mode 100644 seacrowd/sea_datasets/mysentence/__init__.py create mode 100644 seacrowd/sea_datasets/mysentence/mysentence.py create mode 100644 seacrowd/sea_datasets/myxnli/__init__.py create mode 100644 seacrowd/sea_datasets/myxnli/myxnli.py create mode 100644 seacrowd/sea_datasets/newsph/__init__.py create mode 100644 seacrowd/sea_datasets/newsph/newsph.py create mode 100644 seacrowd/sea_datasets/ntrex_128/__init__.py create mode 100644 seacrowd/sea_datasets/ntrex_128/ntrex_128.py create mode 100644 seacrowd/sea_datasets/oil/__init__.py create mode 100644 seacrowd/sea_datasets/oil/oil.py create mode 100644 seacrowd/sea_datasets/openlid/__init__.py create mode 100644 seacrowd/sea_datasets/openlid/openlid.py create mode 100644 seacrowd/sea_datasets/openslr/__init__.py create mode 100644 seacrowd/sea_datasets/openslr/openslr.py create mode 100644 seacrowd/sea_datasets/openvivqa/__init__.py create mode 100644 seacrowd/sea_datasets/openvivqa/openvivqa.py create mode 100644 seacrowd/sea_datasets/orchid_pos/__init__.py create mode 100644 
seacrowd/sea_datasets/orchid_pos/orchid_pos.py create mode 100644 seacrowd/sea_datasets/palito/__init__.py create mode 100644 seacrowd/sea_datasets/palito/palito.py create mode 100644 seacrowd/sea_datasets/ph_fake_news_corpus/__init__.py create mode 100644 seacrowd/sea_datasets/ph_fake_news_corpus/ph_fake_news_corpus.py create mode 100644 seacrowd/sea_datasets/pho_ner_covid/__init__.py create mode 100644 seacrowd/sea_datasets/pho_ner_covid/pho_ner_covid.py create mode 100644 seacrowd/sea_datasets/phoatis/intent_label.txt create mode 100644 seacrowd/sea_datasets/phoatis/phoatis.py create mode 100644 seacrowd/sea_datasets/phoatis/slot_label.txt create mode 100644 seacrowd/sea_datasets/phomt/__init__.py create mode 100644 seacrowd/sea_datasets/phomt/phomt.py create mode 100644 seacrowd/sea_datasets/prdect_id/__init__.py create mode 100644 seacrowd/sea_datasets/prdect_id/prdect_id.py create mode 100644 seacrowd/sea_datasets/qasina/__init__.py create mode 100644 seacrowd/sea_datasets/qasina/qasina.py create mode 100644 seacrowd/sea_datasets/roots_vi_ted/__init__.py create mode 100644 seacrowd/sea_datasets/roots_vi_ted/roots_vi_ted.py create mode 100644 seacrowd/sea_datasets/saltik/__init__.py create mode 100644 seacrowd/sea_datasets/saltik/saltik.py create mode 100644 seacrowd/sea_datasets/sampiran/__init__.py create mode 100644 seacrowd/sea_datasets/sap_wat/__init__.py create mode 100644 seacrowd/sea_datasets/sap_wat/sap_wat.py create mode 100644 seacrowd/sea_datasets/sarawak_malay/__init__.py create mode 100644 seacrowd/sea_datasets/sarawak_malay/sarawak_malay.py create mode 100644 seacrowd/sea_datasets/scb_mt_en_th/__init__.py create mode 100644 seacrowd/sea_datasets/scb_mt_en_th/scb_mt_en_th.py create mode 100644 seacrowd/sea_datasets/sea_bench/__init__.py create mode 100644 seacrowd/sea_datasets/sea_bench/sea_bench.py create mode 100644 seacrowd/sea_datasets/seaeval/__init__.py create mode 100644 seacrowd/sea_datasets/seaeval/seaeval.py create mode 100644 seacrowd/sea_datasets/seahorse/__init__.py create mode 100644 seacrowd/sea_datasets/seahorse/seahorse.py create mode 100644 seacrowd/sea_datasets/snli_indo/__init__.py create mode 100644 seacrowd/sea_datasets/snli_indo/snli_indo.py create mode 100644 seacrowd/sea_datasets/spamid_pair/__init__.py create mode 100644 seacrowd/sea_datasets/spamid_pair/spamid_pair.py create mode 100644 seacrowd/sea_datasets/stb_ext/__init__.py create mode 100644 seacrowd/sea_datasets/stb_ext/stb_ext.py create mode 100644 seacrowd/sea_datasets/struct_amb_ind/__init__.py create mode 100644 seacrowd/sea_datasets/struct_amb_ind/struct_amb_ind.py create mode 100644 seacrowd/sea_datasets/tatabahasa/__init__.py create mode 100644 seacrowd/sea_datasets/tatabahasa/tatabahasa.py create mode 100644 seacrowd/sea_datasets/tcope/__init__.py create mode 100644 seacrowd/sea_datasets/tcope/tcope.py create mode 100644 seacrowd/sea_datasets/tgl_profanity/__init__.py create mode 100644 seacrowd/sea_datasets/tgl_profanity/tgl_profanity.py create mode 100644 seacrowd/sea_datasets/tha_lao_embassy_parcor/__init__.py create mode 100644 seacrowd/sea_datasets/tha_lao_embassy_parcor/tha_lao_embassy_parcor.py create mode 100644 seacrowd/sea_datasets/thai_alpaca/__init__.py create mode 100644 seacrowd/sea_datasets/thai_alpaca/thai_alpaca.py create mode 100644 seacrowd/sea_datasets/thai_constitution/__init__.py create mode 100644 seacrowd/sea_datasets/thai_constitution/thai_constitution.py create mode 100644 seacrowd/sea_datasets/thai_databricks_dolly/__init__.py create mode 100644 
seacrowd/sea_datasets/thai_databricks_dolly/thai_databricks_dolly.py create mode 100644 seacrowd/sea_datasets/thai_depression/__init__.py create mode 100644 seacrowd/sea_datasets/thai_depression/thai_depression.py create mode 100644 seacrowd/sea_datasets/thai_gpteacher/__init__.py create mode 100644 seacrowd/sea_datasets/thai_gpteacher/thai_gpteacher.py create mode 100644 seacrowd/sea_datasets/thai_hh_rlhf/__init__.py create mode 100644 seacrowd/sea_datasets/thai_hh_rlhf/thai_hh_rlhf.py create mode 100644 seacrowd/sea_datasets/thai_sum/__init__.py create mode 100644 seacrowd/sea_datasets/thai_sum/thai_sum.py create mode 100644 seacrowd/sea_datasets/thai_toxicity_tweet/__init__.py create mode 100644 seacrowd/sea_datasets/thai_toxicity_tweet/thai_toxicity_tweet.py create mode 100644 seacrowd/sea_datasets/thaigov/thaigov.py create mode 100644 seacrowd/sea_datasets/tmad_malay_corpus/tmad_malay_corpus.py create mode 100644 seacrowd/sea_datasets/tydiqa/__init__.py create mode 100644 seacrowd/sea_datasets/tydiqa/tydiqa.py delete mode 100644 seacrowd/sea_datasets/tydiqa_id/tydiqa_id.py create mode 100644 seacrowd/sea_datasets/ucla_phonetic/__init__.py create mode 100644 seacrowd/sea_datasets/ucla_phonetic/ucla_phonetic.py create mode 100644 seacrowd/sea_datasets/ud_jv_csui/__init__.py create mode 100644 seacrowd/sea_datasets/ud_jv_csui/ud_jv_csui.py create mode 100644 seacrowd/sea_datasets/uit_vicov19qa/__init__.py create mode 100644 seacrowd/sea_datasets/uit_vicov19qa/uit_vicov19qa.py create mode 100644 seacrowd/sea_datasets/uit_victsd/__init__.py create mode 100644 seacrowd/sea_datasets/uit_victsd/uit_victsd.py create mode 100644 seacrowd/sea_datasets/uit_vihsd/__init__.py create mode 100644 seacrowd/sea_datasets/uit_vihsd/uit_vihsd.py create mode 100644 seacrowd/sea_datasets/uit_viic/__init__.py create mode 100644 seacrowd/sea_datasets/uit_viic/uit_viic.py create mode 100644 seacrowd/sea_datasets/uit_viocd/__init__.py create mode 100644 seacrowd/sea_datasets/uit_viocd/uit_viocd.py create mode 100644 seacrowd/sea_datasets/uit_vion/__init__.py create mode 100644 seacrowd/sea_datasets/uit_vion/uit_vion.py create mode 100644 seacrowd/sea_datasets/uit_vsfc/__init__.py create mode 100644 seacrowd/sea_datasets/uit_vsfc/uit_vsfc.py create mode 100644 seacrowd/sea_datasets/uit_vsmec/__init__.py create mode 100644 seacrowd/sea_datasets/uit_vsmec/uit_vsmec.py create mode 100644 seacrowd/sea_datasets/unimorph/__init__.py create mode 100644 seacrowd/sea_datasets/unimorph/unimorph.py create mode 100644 seacrowd/sea_datasets/vi_pubmed/__init__.py create mode 100644 seacrowd/sea_datasets/vi_pubmed/vi_pubmed.py create mode 100644 seacrowd/sea_datasets/vihealthqa/__init__.py create mode 100644 seacrowd/sea_datasets/vihealthqa/vihealthqa.py create mode 100644 seacrowd/sea_datasets/visobert/__init__.py create mode 100644 seacrowd/sea_datasets/visobert/visobert.py create mode 100644 seacrowd/sea_datasets/vispamreviews/__init__.py create mode 100644 seacrowd/sea_datasets/vispamreviews/vispamreviews.py create mode 100644 seacrowd/sea_datasets/vistec_tp_th_21/__init__.py create mode 100644 seacrowd/sea_datasets/vistec_tp_th_21/vistec_tp_th_21.py create mode 100644 seacrowd/sea_datasets/vivos/__init__.py create mode 100644 seacrowd/sea_datasets/vivos/vivos.py create mode 100644 seacrowd/sea_datasets/vivqa/__init__.py create mode 100644 seacrowd/sea_datasets/vivqa/vivqa.py create mode 100644 seacrowd/sea_datasets/vlsp2016_ner/__init__.py create mode 100644 seacrowd/sea_datasets/vlsp2016_ner/vlsp2016_ner.py create mode 
100644 seacrowd/sea_datasets/vlsp2016_sa/__init__.py create mode 100644 seacrowd/sea_datasets/vlsp2016_sa/vlsp2016_sa.py create mode 100644 seacrowd/sea_datasets/vndt/__init__.py create mode 100644 seacrowd/sea_datasets/vndt/utils.py create mode 100644 seacrowd/sea_datasets/vndt/vndt.py create mode 100644 seacrowd/sea_datasets/voxlingua/__init__.py create mode 100644 seacrowd/sea_datasets/voxlingua/voxlingua.py create mode 100644 seacrowd/sea_datasets/weathub/__init__.py create mode 100644 seacrowd/sea_datasets/weathub/weathub.py create mode 100644 seacrowd/sea_datasets/wikimatrix/__init__.py create mode 100644 seacrowd/sea_datasets/wikimatrix/wikimatrix.py create mode 100644 seacrowd/sea_datasets/wikitext_tl_39/__init__.py create mode 100644 seacrowd/sea_datasets/wikitext_tl_39/wikitext_tl_39.py create mode 100644 seacrowd/sea_datasets/wili_2018/__init__.py create mode 100644 seacrowd/sea_datasets/wili_2018/wili_2018.py create mode 100644 seacrowd/sea_datasets/wisesight_thai_sentiment/__init__.py create mode 100644 seacrowd/sea_datasets/wisesight_thai_sentiment/wisesight_thai_sentiment.py create mode 100644 seacrowd/sea_datasets/wit/__init__.py create mode 100644 seacrowd/sea_datasets/wit/wit.py create mode 100644 seacrowd/sea_datasets/wongnai_reviews/__init__.py create mode 100644 seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py create mode 100644 seacrowd/sea_datasets/xl_jailbreak/__init__.py create mode 100644 seacrowd/sea_datasets/xl_jailbreak/xl_jailbreak.py create mode 100644 seacrowd/sea_datasets/xnli/__init__.py create mode 100644 seacrowd/sea_datasets/xnli/xnli.py create mode 100644 seacrowd/sea_datasets/xstorycloze/__init__.py create mode 100644 seacrowd/sea_datasets/xstorycloze/xstorycloze.py create mode 100644 seacrowd/utils/schemas/imqa.py create mode 100644 seacrowd/utils/schemas/tod.py create mode 100644 seacrowd/utils/schemas/tree.py create mode 100644 templates/initiate_seacrowd_dataloader.sh diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index a72c04c54..9627e345f 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,3 +1,3 @@ # These are the current maintainers/admin of the seacrowd-datahub repo -* @holylovenia @samuelcahyawijaya @sabilmakbar @jamesjaya @yongzx @gentaiscool @ljvmiranda921 @RosenZhang @fajri91 +* @holylovenia @samuelcahyawijaya @sabilmakbar @jamesjaya @yongzx @gentaiscool @ljvmiranda921 @jen-santoso @danjohnvelasco @MJonibek @tellarin diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index d60dc2956..a8f500579 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,11 +1,17 @@ -Please name your PR after the issue it closes. You can use the following line: "Closes #ISSUE-NUMBER" where you replace the ISSUE-NUMBER with the one corresponding to your dataset. +Please name your PR title and the first line of PR message after the issue it will close. You can use the following examples: + +**Title**: Closes #{ISSUE_NUMBER} | Add/Update Dataloader {DATALOADER_NAME} + +**First line PR Message**: Closes #{ISSUE_NUMBER} + +where you replace the {ISSUE_NUMBER} with the one corresponding to your dataset. ### Checkbox - [ ] Confirm that this PR is linked to the dataset issue. -- [ ] Create the dataloader script `seacrowd/sea_datasets/my_dataset/my_dataset.py` (please use only lowercase and underscore for dataset naming). 
-- [ ] Provide values for the `_CITATION`, `_DATASETNAME`, `_DESCRIPTION`, `_HOMEPAGE`, `_LICENSE`, `_URLs`, `_SUPPORTED_TASKS`, `_SOURCE_VERSION`, and `_SEACROWD_VERSION` variables. +- [ ] Create the dataloader script `seacrowd/sea_datasets/{my_dataset}/{my_dataset}.py` (please use only lowercase and underscore for dataset folder naming, as mentioned in dataset issue) and its `__init__.py` within `{my_dataset}` folder. +- [ ] Provide values for the `_CITATION`, `_DATASETNAME`, `_DESCRIPTION`, `_HOMEPAGE`, `_LICENSE`, `_LOCAL`, `_URLs`, `_SUPPORTED_TASKS`, `_SOURCE_VERSION`, and `_SEACROWD_VERSION` variables. - [ ] Implement `_info()`, `_split_generators()` and `_generate_examples()` in dataloader script. - [ ] Make sure that the `BUILDER_CONFIGS` class attribute is a list with at least one `SEACrowdConfig` for the source schema and one for a seacrowd schema. - [ ] Confirm dataloader script works with `datasets.load_dataset` function. -- [ ] Confirm that your dataloader script passes the test suite run with `python -m tests.test_seacrowd seacrowd/sea_datasets//.py`. +- [ ] Confirm that your dataloader script passes the test suite run with `python -m tests.test_seacrowd seacrowd/sea_datasets//.py` or `python -m tests.test_seacrowd seacrowd/sea_datasets//.py --subset_id {subset_name_without_source_or_seacrowd_suffix}`. - [ ] If my dataset is local, I have provided an output of the unit-tests in the PR (please copy paste). This is OPTIONAL for public datasets, as we can test these without access to the data files. diff --git a/.github/workflows/add-new-comment-on-stale.yml b/.github/workflows/add-new-comment-on-stale.yml new file mode 100644 index 000000000..26ccfd46a --- /dev/null +++ b/.github/workflows/add-new-comment-on-stale.yml @@ -0,0 +1,43 @@ +# This workflow is a continuation of "Mark stale issues and pull requests" workflow, on adding customized comment. + +# You can adjust the behavior by modifying this file. +# For more information, see: +# https://github.com/peter-evans/create-or-update-comment +name: Adding reminder comment on staled issues & PRs +on: + issues: + types: + - labeled + # read these to see why it uses 'pull_request_target' instead of 'pull_request': + # 1. https://securitylab.github.com/research/github-actions-preventing-pwn-requests/ + # 2. https://github.com/peter-evans/create-or-update-comment?tab=readme-ov-file#action-inputs (note section) + pull_request_target: + types: + - labeled + +jobs: + add-comment-on-staled-issue: + if: github.event.label.name == 'staled-issue' + runs-on: ubuntu-latest + permissions: + issues: write + + steps: + - name: Remind assignee on staled Issue + uses: peter-evans/create-or-update-comment@v2 + with: + issue-number: ${{github.event.issue.number}} + body: "Hi @${{github.event.issue.assignee.login}}, may I know if you are still working on this issue? Please let @holylovenia @SamuelCahyawijaya @sabilmakbar know if you need any help." + + add-comment-on-staled-pr: + if: github.event.label.name == 'need-fu-pr' + runs-on: ubuntu-latest + permissions: + pull-requests: write + + steps: + - name: Remind assignee and author on staled PR + uses: peter-evans/create-or-update-comment@v2 + with: + issue-number: ${{github.event.pull_request.number}} + body: "Hi @${{join(github.event.pull_request.assignees.*.login, ', @')}} & @${{github.event.pull_request.user.login}}, may I know if you are still working on this PR?" 
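For clarity, the reminder step in the workflow above boils down to a single GitHub REST API call that posts a comment on the stale issue or PR. The sketch below is a minimal, hypothetical Python equivalent of that step; the repository name, issue number, and token handling are placeholder assumptions for illustration, not values taken from this patch.

```python
# Minimal sketch of what the "create-or-update-comment" step does:
# POST a reminder comment on a stale issue/PR via the GitHub REST API.
# All concrete values below (owner/repo, issue number, token) are placeholders.
import os

import requests

OWNER_REPO = "SEACrowd/seacrowd-datahub"  # repository the workflow runs in (assumed)
ISSUE_NUMBER = 123                        # placeholder issue/PR number
TOKEN = os.environ["GITHUB_TOKEN"]        # provided as a secret in GitHub Actions

resp = requests.post(
    f"https://api.github.com/repos/{OWNER_REPO}/issues/{ISSUE_NUMBER}/comments",
    headers={
        "Authorization": f"Bearer {TOKEN}",
        "Accept": "application/vnd.github+json",
    },
    json={"body": "Hi, may I know if you are still working on this issue?"},
)
resp.raise_for_status()  # expects 201 Created on success
```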
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale-labeler.yml similarity index 67% rename from .github/workflows/stale.yml rename to .github/workflows/stale-labeler.yml index 800329e06..07339c308 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale-labeler.yml @@ -10,8 +10,8 @@ on: - cron: '20 1 * * *' jobs: - stale: - + stale_detection: + name: Detect Stale Issues/PR runs-on: ubuntu-latest permissions: issues: write @@ -20,11 +20,15 @@ jobs: steps: - uses: actions/stale@v8 with: + repo-token: ${{ secrets.GITHUB_TOKEN }} - stale-issue-message: 'Hi, may I know if you are still working on this issue? Please let @holylovenia @SamuelCahyawijaya @sabilmakbar know if you need any help.' - stale-issue-label: 'staled-issue' + # only labels the stale, the comment addition will be handled by another workflow + stale-issue-message: "" + stale-pr-message: "" + stale-issue-label: "staled-issue" + stale-pr-label: "need-fu-pr" days-before-stale: 14 days-before-close: -1 include-only-assigned: true exempt-issue-labels: 'in-progress,pr-ready' - operations-per-run: 100 + operations-per-run: 200 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index 5093649f5..000000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,208 +0,0 @@ -# Guideline for contributing a dataloader implementation - -## Pre-Requisites - -Please make a GitHub account prior to implementing a dataset; you can follow the instructions to install git [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). - -You will also need at least Python 3.6+. If you are installing Python, we recommend downloading [anaconda](https://docs.anaconda.com/anaconda/install/index.html) to curate a Python environment with the necessary packages. **We strongly recommend Python 3.8+ for stability**. - -**Optional** Setup your GitHub account with SSH ([instructions here](https://docs.github.com/en/authentication/connecting-to-github-with-ssh).) - -### 1. **Assigning a dataloader** -- Choose a dataset from the [list of SEACrowd datasets](https://github.com/orgs/SEACrowd/projects/1/views/1). -
- -- Assign yourself an issue by commenting `#self-assign` under the issue. **Please assign yourself to issues with no other collaborators assigned**. You should see your GitHub username associated with the issue within 1-2 minutes of making a comment. - -
- -- Search to see if the dataset exists in the 🤗 [Hub](https://huggingface.co/datasets). If it exists, please use the current implementation as the `source` and focus on implementing the [task-specific seacrowd schema](https://github.com/SEACrowd/seacrowd-datahub/blob/master/task_schemas.md). - -- If not, find the dataset online, usually uploaded in Github or Google Drive. - -### 2. **Setup a local version of the SEACrowd repo** -Fork the seacrowd-datahub [repository](https://github.com/SEACrowd/seacrowd-datahub) to your local Github account. To do this, click the link to the repository and click "fork" in the upper-right corner. - -After you fork, clone the repository locally. You can do so as follows: - - git clone git@github.com:/seacrowd-datahub.git - cd seacrowd-datahub # enter the directory - -Next, you want to set your `upstream` location to enable you to push/pull (add or receive updates). You can do so as follows: - - git remote add upstream git@github.com:SEACrowd/seacrowd-datahub.git - -You can optionally check that this was set properly by running the following command: - - git remote -v - -The output of this command should look as follows: - - origin git@github.com:/seacrowd-datahub.git (fetch) - origin git@github.com:/seacrowd-datahub.git (push) - upstream git@github.com:SEACrowd/seacrowd-datahub.git (fetch) - upstream git@github.com:SEACrowd/seacrowd-datahub.git (push) - -If you do NOT have an `origin` for whatever reason, then run: - - git remote add origin git@github.com:/seacrowd-datahub.git - -The goal of `upstream` is to keep your repository up-to-date with any changes made officially to the datasets library. You can do this as follows by running the following commands: - - git fetch upstream - git pull - -Provided you have no *merge conflicts*, this will ensure the library stays up-to-date as you make changes. However, before you make changes, you should make a custom branch to implement your changes. - -You can make a new branch as such: - - git checkout -b - -
-Please do not make changes on the master branch!
- -Always make sure you're on the right branch with the following command: - - git branch - -The correct branch will have an asterisk \* in front of it. - -### 2. **Create a development environment** -You can make an environment in any way you choose. We highlight two possible options: - -#### 2a) Create a conda environment - -The following instructions will create an Anaconda `env-seacrowd-datahub` environment. - -- Install [anaconda](https://docs.anaconda.com/anaconda/install/) for your appropriate operating system. -- Run the following command while in the `sea_datasets` folder (you can pick your python version): - -``` -conda env create -f conda.yml # Creates a conda env -conda activate env-seacrowd-datahub # Activate your conda environment -``` - -You can deactivate your environment at any time by either exiting your terminal or using `conda deactivate`. - -#### 2b) Create a venv environment - -Python 3.3+ has venv automatically installed; official information is found [here](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/). - -``` -python3 -m venv -source /bin/activate # activate environment -pip install -r requirements.txt # Install this while in the datasets folder -``` -Make sure your `pip` package points to your environment's source. - -### 3. Implement your dataloader - -Make a new directory within the `SEACrowd/seacrowd-datahub/sea_datasets` directory: - - mkdir seacrowd-datahub/sea_datasets/ - -Please use lowercase letters and underscores when choosing a ``. -To implement your dataset, there are three key methods that are important: - - * `_info`: Specifies the schema of the expected dataloader - * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. - * `_generate_examples`: Create examples from data that conform to each schema defined in `_info`. - -To start, copy [templates/template.py](templates/template.py) to your `seacrowd/sea_datasets/` directory with the name `.py`. Within this file, fill out all the TODOs. - - cp templates/template.py seacrowd/sea_datasets//.py - -For the `_info_` function, you will need to define `features` for your -`DatasetInfo` object. For the `bigbio` config, choose the right schema from our list of examples. You can find a description of these in the [Task Schemas Document](task_schemas.md). You can find the actual schemas in the [schemas directory](seacrowd/utils/schemas). - -You will use this schema in the `_generate_examples` return value. - -Populate the information in the dataset according to this schema; some fields may be empty. - -To enable quality control, please add the following line in your file before the class definition: -```python -from seacrowd.utils.constants import Tasks -_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.DEPENDENCY_PARSING] -``` - -##### Example scripts: -To help you implement a dataset, you can see the implementation of [other dataset scripts](seacrowd/sea_datasets). - -#### Running & Debugging: -You can run your data loader script during development by appending the following -statement to your code ([templates/template.py](templates/template.py) already includes this): - -```python -if __name__ == "__main__": - datasets.load_dataset(__file__) -``` - -If you want to use an interactive debugger during development, you will have to use -`breakpoint()` instead of setting breakpoints directly in your IDE. 
Most IDEs will -recognize the `breakpoint()` statement and pause there during debugging. If your prefered -IDE doesn't support this, you can always run the script in your terminal and debug with -`pdb`. - - -### 4. Check if your dataloader works - -Make sure your dataset is implemented correctly by checking in python the following commands: - -```python -from datasets import load_dataset - -data = load_dataset("seacrowd/sea_datasets//.py", name="_seacrowd_") -``` - -Run these commands from the top level of the `nusa-crowd` repo (i.e. the same directory that contains the `requirements.txt` file). - -Once this is done, please also check if your dataloader satisfies our unit tests as follows by using this command in the terminal: - -```bash -python -m tests.test_seacrowd seacrowd/sea_datasets//.py [--data_dir /path/to/local/data] -``` - -Your particular dataset may require use of some of the other command line args in the test script. -To view full usage instructions you can use the `--help` command, - -```bash -python -m tests.test_seacrowd --help -``` - -### 5. Format your code - -From the main directory, run the Makefile via the following command: - - make check_file=seacrowd/sea_datasets//.py - -This runs the black formatter, isort, and lints to ensure that the code is readable and looks nice. Flake8 linting errors may require manual changes. - -### 6. Commit your changes - -First, commit your changes to the branch to "add" the work: - - git add seacrowd/sea_datasets//.py - git commit -m "A message describing your commits" - -Then, run the following commands to incorporate any new changes in the master branch of datasets as follows: - - git fetch upstream - git rebase upstream/master - -Or you can install the pre-commit hooks to automatically pre-check before commit by: - - pre-commit install -**Run these commands in your custom branch**. - -Push these changes to **your fork** with the following command: - - git push -u origin - -### 7. **Make a pull request** - -Make a Pull Request to implement your changes on the main repository [here](https://github.com/SEACrowd/seacrowd-datahub/pulls). To do so, click "New Pull Request". Then, choose your branch from your fork to push into "base:master". - -When opening a PR, please link the [issue](https://github.com/SEACrowd/seacrowd-datahub/issues) corresponding to your dataset using [closing keywords](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) in the PR's description, e.g. `resolves #17`. diff --git a/DATALOADER.md b/DATALOADER.md index 5093649f5..1a94104ee 100644 --- a/DATALOADER.md +++ b/DATALOADER.md @@ -100,20 +100,21 @@ Make sure your `pip` package points to your environment's source. ### 3. Implement your dataloader -Make a new directory within the `SEACrowd/seacrowd-datahub/sea_datasets` directory: +Use this bash script to initialize your new dataloader folder along with template of your dataloader script under `SEACrowd/seacrowd-datahub/sea_datasets` directory using this: - mkdir seacrowd-datahub/sea_datasets/ + sh templates/initiate_seacrowd_dataloader.sh +The value of `` can be checked on the issue ticket that you were assigned to. -Please use lowercase letters and underscores when choosing a ``. +i.e: for this [issue ticket](https://github.com/SEACrowd/seacrowd-datahub/issues/32), the dataloader name indicates `Dataloader name: xl_sum/xl_sum.py`, hence the value of `` is `xl_sum`. + +Please use PascalCase when choosing a ``. 
To implement your dataset, there are three key methods that are important: * `_info`: Specifies the schema of the expected dataloader * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. * `_generate_examples`: Create examples from data that conform to each schema defined in `_info`. -To start, copy [templates/template.py](templates/template.py) to your `seacrowd/sea_datasets/` directory with the name `.py`. Within this file, fill out all the TODOs. - - cp templates/template.py seacrowd/sea_datasets//.py +After the bash above has been executed, you'll have your `seacrowd/sea_datasets/` directory existed with the name `.py`. Within this file, fill out all the TODOs based on the template. For the `_info_` function, you will need to define `features` for your `DatasetInfo` object. For the `bigbio` config, choose the right schema from our list of examples. You can find a description of these in the [Task Schemas Document](task_schemas.md). You can find the actual schemas in the [schemas directory](seacrowd/utils/schemas). @@ -133,7 +134,7 @@ To help you implement a dataset, you can see the implementation of [other datase #### Running & Debugging: You can run your data loader script during development by appending the following -statement to your code ([templates/template.py](templates/template.py) already includes this): +statement to your code (if you have your dataloader folder initialized using previous bash script, it already includes this, else you may add these by yourself): ```python if __name__ == "__main__": @@ -157,7 +158,7 @@ from datasets import load_dataset data = load_dataset("seacrowd/sea_datasets//.py", name="_seacrowd_") ``` -Run these commands from the top level of the `nusa-crowd` repo (i.e. the same directory that contains the `requirements.txt` file). +Run these commands from the top level of the `seacrowd/seacrowd-datahub` repo (i.e. the same directory that contains the `requirements.txt` file). Once this is done, please also check if your dataloader satisfies our unit tests as follows by using this command in the terminal: @@ -195,6 +196,7 @@ Then, run the following commands to incorporate any new changes in the master br Or you can install the pre-commit hooks to automatically pre-check before commit by: pre-commit install + **Run these commands in your custom branch**. Push these changes to **your fork** with the following command: diff --git a/POINTS.md b/POINTS.md index d495c523b..f7d295517 100644 --- a/POINTS.md +++ b/POINTS.md @@ -1,6 +1,6 @@ # Contribution point guideline -To be considered as a co-author, 20 contribution points are required. +To be considered as a co-author, 20 contribution points are required. To monitor how many points that you have obtained, the contribution point tracking is now live at [this sheet](https://docs.google.com/spreadsheets/d/e/2PACX-1vQDZtJjA6i7JsxS5IlMtVuwOYjr2Pbl_b47yMSH4aAdHDBIpf-CiJQjNQAzcJPEu_aE7kwH4ZvKvPm0/pubhtml?gid=225616890&single=true) and will be updated regularly (although not automatically, yet)! > **Note**: The purpose of the point system is not to barrier collaboration, but to reward rare and high-quality dataset entries. We might adjust the point requirement lower to accommodate more co-authorship if needed. 
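To make the "check if your dataloader works" step described in DATALOADER.md above more concrete, here is a minimal sketch of a local sanity check. The dataset name, config suffix, and printed fields are placeholders to be replaced with the values for your own dataloader; this is an illustrative check, not a prescribed test.

```python
# Minimal sketch of a local dataloader sanity check, run from the repo root.
# "<dataset_name>" and the config suffix are placeholders; substitute the folder
# name from your issue ticket and a config defined in your BUILDER_CONFIGS.
from datasets import load_dataset

dataset_name = "<dataset_name>"          # folder you created under seacrowd/sea_datasets/
config_name = f"{dataset_name}_source"   # or f"{dataset_name}_seacrowd_<schema>"

data = load_dataset(
    f"seacrowd/sea_datasets/{dataset_name}/{dataset_name}.py",
    name=config_name,
)

# Quick plausibility check: list the splits and inspect the first example.
print(data)
first_split = list(data.keys())[0]
print(next(iter(data[first_split])))
```

If this loads without errors and the first example matches the schema declared in `_info()`, the unit tests in `tests/test_seacrowd.py` are the next step.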
@@ -8,7 +8,7 @@ We might adjust the point requirement lower to accommodate more co-authorship if | Contribution type | Demand | Points | Max points | Job description | | ------------------------------ | ------------------- | ------ | ----------------------- | ------------------------------------------------------------------------------------------------------------------------ | | Public Datasheet Submission | As many as possible | 2+bonus | 6 | Submit public datasheet via [jotform](https://www.jotform.com/team/232952680898069/seacrowd-sea-datasets) | -| Private Datasheet Submission | As many as possible | 1 | | Submit private datasheet via [jotform](https://www.jotform.com/team/232952680898069/seacrowd-sea-datasets) | +| Private Datasheet Submission | As many as possible | 1 | | Submit private datasheet via [jotform](https://www.jotform.com/team/232952680898069/seacrowd-paper-with-private-dataset) | | Open Access to Private Dataset | As many as possible | 4+bonus | 10 for the high-quality | Only private dataset owners can do this. Upload the data in a public repository and submit the datasheet in [jotform](https://www.jotform.com/team/232952680898069/seacrowd-sea-datasets). | | Dataloader Implementation | As many as possible | 3 | 6 for the hard one | Implement dataloader based on the respective dataset's schema and task. | diff --git a/README.md b/README.md index db338543d..ae1404473 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ South East Asia is home to more than 1,000 native languages. Nevertheless, South ## How to contribute? -You can contribute by proposing **unregistered NLP dataset** on [our record](https://seacrowd.github.io/seacrowd-catalogue/). [Just fill out this form](https://jotform.com/team/232952680898069/seacrowd-sea-datasets), and we will check and approve your entry. +You can contribute by proposing **unregistered NLP dataset** on [our approved record](https://seacrowd.github.io/seacrowd-catalogue/) and our [in-review datasets](https://docs.google.com/spreadsheets/d/1ibbywsC1tQ_sLPX8bUAjC-vrTrUqZgZA46W_sxWw4Ss/edit?usp=sharing). [Just fill out this form](https://jotform.com/team/232952680898069/seacrowd-sea-datasets), and we will check and approve your entry if it meets our requirements (see [this](https://github.com/SEACrowd/seacrowd-datahub/blob/master/REVIEWING.md#approval-checklist) for the detailed checklist). We will give **contribution points** based on several factors, including: **supported modality**, **language scarcity**, or **task scarcity**. @@ -55,8 +55,7 @@ The license for a dataset is not always obvious. Here are some strategies to try If no official license is listed anywhere, but you find a webpage that describes general data usage policies for the dataset, you can fall back to providing that URL in the `_LICENSE` variable. If you can't find any license information, please note in your PR and put `_LICENSE="Unknown"` in your dataset script. #### What if my dataset is not yet publicly available? - -You can upload your dataset publicly first, eg. on Github. +You can upload your dataset publicly first, eg. on Github. If you're an owner of a Private Dataset that is being contacted by SEACrowd Representative for a possibility of opening that dataset, you may visit this [Private Dataset FAQ](PRIVATE.md). #### Can I create a PR if I have an idea? @@ -71,7 +70,7 @@ Yes, you can ask for helps in SEACrowd's community channel! Please join our [Dis We greatly appreciate your help! 
-The artifacts of this initiative will be described in a forthcoming academic paper targeting a machine learning or NLP audience. Please refer to [this section](#contribution-guidelines) for your contribution rewards for helping South-East Asian NLP. We recognize that some datasets require more effort than others, so please reach out if you have questions. Our goal is to be inclusive with credit! +The artifacts of this initiative will be described in a forthcoming academic paper targeting a machine learning or NLP audience. Please refer to [this section](https://github.com/SEACrowd#how-much-should-i-contribute) for your contribution rewards in helping South-East Asian NLP. We recognize that some datasets require more effort than others, so please reach out if you have questions. Our goal is to be inclusive with credit! ## Acknowledgements diff --git a/REVIEWING.md b/REVIEWING.md index 3463587d2..78ebfe540 100644 --- a/REVIEWING.md +++ b/REVIEWING.md @@ -51,3 +51,51 @@ Check the following before approving: 3. Check the scoring guide and see which languages gets additional points (if any). 4. Add the dataloader name (use Python snake case) 5. Wait for a GitHub issue to be generated for the approved datasheet. + + +# Dataloader Reviewer SOP + +The objective of datasheet review is to ensure that all dataloaders in SEACrowd conform to the HF Dataloader Structure and SEACrowd-defined schema and config and follow a similar code format and/or style. + +### Dataloader Check +1. Metadata correctness. (ensure Tasks, Languages, HOME_URL, DATA_URL is used). Make sure the dataloader also has `__init__.py`. +2. All subsets are implemented correctly to respective dataloader issue and according to SEACrowd Schema definition (has both `source` and `seacrowd` schema -- if a given task has its SEACrowd Schema, else can raise it to reviewers/mods). +3. Pass the test scripts defined in `tests` folder. +4. Pass manual check. + a. Perform a sampling of configs based on Lang and/or Task combinations + b. Execute `datasets.load_dataset` check based on config list (a) + c. Check on the dataset schema & few first examples for plausibility. +5. Follows some general rules/conventions: + a. Use `PascalCase` for the dataloader class name (optional: "Dataset" can be appended to the Dataloader class name; see `templates/template.py` for example). + b. Use lowercase word characters (regex identifier: `\w`) for schema column names, including the `source` schema if the original dataset doesn't follow it. +6. The code aligns with the `black` formatter. Hint: +use this `make check_file=seacrowd/sea_datasets/{dataloader}/{dataloader}.py` +7. Follows Dataloader Config Rules (will be described in the following) + +### Dataloader Config Rules +Based on the compulsory Dataloader Configs listed on Datasheet Issue, the dataset are divided into 4 different types: +1. Single Subset, Single Task (Type 1) +2. Multiple Subsets, Single Task (Type 2) +3. Single Subset, Multiple Task (Type 3) +4. Multiple Subsets, Multiple Task (Type 4) + + **Note for Multilingual Dataset:** + + For a multilingual dataset, generally, it falls under multiple subsets type (since one language is considered as a standalone subset of the dataloder) unless it's influencing the label heavily or it doesn't make sense to split the data based on the languages (for instance, in the case of Lang Identification or Linguistic Features/Unit Identification). + +Based on aforementioned types, the checklist for Config Correctness is as follows: +1. 
For type 1 & 3, both config of `f”{_DATASETNAME}_source”` and `f”{_DATASETNAME}_seacrowd_{TASK_TO_SCHEMA}”` must be implemented. +2. For type 2 and 4, the dataloader config in (1) generally shouldn't be implemented (case-by-case checking can be done if needed). Consequently, it must cover all listed subsets in Dataloader Issue. +3. The formatting for config names that have multiple subsets are + 1. `f”{_DATASETNAME}_{subset_name}_source”` + 2. `f”{_DATASETNAME}_{subset_name}_seacrowd_{TASK_TO_SCHEMA}”` + + **If the subset name contains language info, the lang identifier should be in a `ISO_639_3` lang code.** +3. For point (2), since it won't pass the test-cases using the default args, a custom arg must be provided by the Dataloader PR creator (or Dataloader Issue Assignee) to ensure the reproducibility of Testing among reviewers. The reviewers can add the testing args if necessary. + +## Approval and Dataloader Reviewer Assignment Process +1. Every dataloader requires 2 reviewers per issue (the assignee must not review their own dataloader). +2. Once the second reviewer is approved, the PR can be merged to the `master` branch using the `squash and merge` strategy for a cleaner commit history. +3. For the Reviewers' Assignment, there are two possible ways: + 1. @holylovenia will assign and monitor reviewers once a week to maintain and balance the load and overall pace. It will prioritize dataloaders used for experiments, then on reverse chronological order based on PR created time. + 2. Any reviewers can take any unassigned PR as long as the review can be done promptly. diff --git a/requirements.txt b/requirements.txt index 6c6df4099..9b4635f8b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,8 @@ openpyxl==3.1.2 translate-toolkit==3.7.3 typing_extensions scikit-learn==1.1.2 +pyarrow +opencv-python>=4.9 +textgrid==1.5 +audiosegment==0.23.0 +pyreadr==0.5.0 \ No newline at end of file diff --git a/seacrowd/sea_datasets/tydiqa_id/__init__.py b/seacrowd/sea_datasets/abui_wordnet/__init__.py similarity index 100% rename from seacrowd/sea_datasets/tydiqa_id/__init__.py rename to seacrowd/sea_datasets/abui_wordnet/__init__.py diff --git a/seacrowd/sea_datasets/abui_wordnet/abui_wordnet.py b/seacrowd/sea_datasets/abui_wordnet/abui_wordnet.py new file mode 100644 index 000000000..a31d438bf --- /dev/null +++ b/seacrowd/sea_datasets/abui_wordnet/abui_wordnet.py @@ -0,0 +1,146 @@ +# coding=utf-8 + + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{kratochvil-morgado-da-costa-2022-abui, +title = "{A}bui {W}ordnet: Using a Toolbox Dictionary to develop a wordnet for a low-resource language", +author = "Kratochvil, Frantisek and + Morgado da Costa, Lu{\'}s", +editor = "Serikov, Oleg and + Voloshina, Ekaterina and + Postnikova, Anna and + Klyachko, Elena and + Neminova, Ekaterina and + Vylomova, Ekaterina and + Shavrina, Tatiana and + Ferrand, Eric Le and + Malykh, Valentin and + Tyers, Francis and + Arkhangelskiy, Timofey and + Mikhailov, Vladislav and + Fenogenova, Alena", +booktitle = "Proceedings of the first workshop on NLP applications to field linguistics", +month = oct, +year = "2022", +address = "Gyeongju, Republic of Korea", +publisher = "International Conference on Computational Linguistics", +url = "https://aclanthology.org/2022.fieldmatters-1.7", +pages = "54--63", 
+abstract = "This paper describes a procedure to link a Toolbox dictionary of a low-resource language to correct +synsets, generating a new wordnet. We introduce a bootstrapping technique utilising the information in the gloss +fields (English, national, and regional) to generate sense candidates using a naive algorithm based on +multilingual sense intersection. We show that this technique is quite effective when glosses are available in +more than one language. Our technique complements the previous work by Rosman et al. (2014) which linked the +SIL Semantic Domains to wordnet senses. Through this work we have created a small, fully hand-checked wordnet +for Abui, containing over 1,400 concepts and 3,600 senses.", +} +""" +_DATASETNAME = "abui_wordnet" +_DESCRIPTION = """\ +A small fully hand-checked wordnet for Abui, containing over 1,400 concepts and 3,600 senses, is created. A +bootstrapping technique is introduced to utilise the information in the gloss fields (English, national, and regional) +to generate sense candidates using a naive algorithm based on multilingual sense intersection. +""" + +_HOMEPAGE = "https://github.com/fanacek/abuiwn" +_LANGUAGES = ["abz"] +_LICENSE = Licenses.CC_BY_4_0.value +_LOCAL = False +_URLS = { + _DATASETNAME: "https://raw.githubusercontent.com/fanacek/abuiwn/main/abwn_lmf.tsv", +} + +_SUPPORTED_TASKS = [Tasks.WORD_ANALOGY] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class AbuiwordnetDataset(datasets.GeneratorBasedBuilder): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=_DESCRIPTION, + schema="source", + subset_id="abui_wordnet", + ), + # SEACrowdConfig( + # name="abui_wordnet_seacrowd_ww", + # version=SEACROWD_VERSION, + # description="abuiw SEACrowd schema", + # schema="seacrowd_a", + # subset_id="abui_wordnet", + # ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + features = None + if self.config.schema == "source": + features = datasets.Features( + { + "sense": datasets.Value("string"), + "pos": datasets.Value("string"), + "lang": datasets.Value("string"), + "lemma": datasets.Value("string"), + "form": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_pair": + features = schemas.pairs_features + raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name="senses", + gen_kwargs={ + "filepath": data_dir, + }, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + with open(filepath, "r") as filein: + data_instances = [inst.strip("\n").split("\t") for inst in filein.readlines()] + if self.config.schema == "source": + for idx, example in enumerate(data_instances): + sense = example[0] + pos = example[0][-1] + lang = example[1] + lemma = example[2] + form = "" if len(example) == 3 else example[3] + yield idx, { + "sense": sense, + "pos": pos, + "lang": lang, + "lemma": lemma, + "form": form, + } + # elif self.config.schema == "seacrowd_pair": + # diff --git a/seacrowd/sea_datasets/alt_burmese_treebank/__init__.py 
b/seacrowd/sea_datasets/alt_burmese_treebank/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py b/seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py new file mode 100644 index 000000000..ad781f105 --- /dev/null +++ b/seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py @@ -0,0 +1,151 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.sea_datasets.alt_burmese_treebank.utils.alt_burmese_treebank_utils import extract_data +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{ + 10.1145/3373268, + author = {Ding, Chenchen and Yee, Sann Su Su and Pa, Win Pa and Soe, Khin Mar and Utiyama, Masao and Sumita, Eiichiro}, + title = {A Burmese (Myanmar) Treebank: Guideline and Analysis}, + year = {2020}, + issue_date = {May 2020}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + volume = {19}, + number = {3}, + issn = {2375-4699}, + url = {https://doi.org/10.1145/3373268}, + doi = {10.1145/3373268}, + abstract = {A 20,000-sentence Burmese (Myanmar) treebank on news articles has been released under a CC BY-NC-SA license.\ + Complete phrase structure annotation was developed for each sentence from the morphologically annotated data\ + prepared in previous work of Ding et al. [1]. As the final result of the Burmese component in the Asian\ + Language Treebank Project, this is the first large-scale, open-access treebank for the Burmese language.\ + The annotation details and features of this treebank are presented.\ + }, + journal = {ACM Trans. Asian Low-Resour. Lang. Inf. Process.}, + month = {jan}, + articleno = {40}, + numpages = {13}, + keywords = {Burmese (Myanmar), phrase structure, treebank} +} +""" + +_DATASETNAME = "alt_burmese_treebank" + +_DESCRIPTION = """\ +A 20,000-sentence Burmese (Myanmar) treebank on news articles containing complete phrase structure annotation.\ +As the final result of the Burmese component in the Asian Language Treebank Project, this is the first large-scale,\ +open-access treebank for the Burmese language. 
+""" + +_HOMEPAGE = "https://zenodo.org/records/3463010" + +_LANGUAGES = ["mya"] + +_LICENSE = Licenses.CC_BY_NC_SA_4_0.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://zenodo.org/records/3463010/files/my-alt-190530.zip?download=1", +} + +_SUPPORTED_TASKS = [Tasks.CONSTITUENCY_PARSING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class AltBurmeseTreebank(datasets.GeneratorBasedBuilder): + """A 20,000-sentence Burmese (Myanmar) treebank on news articles containing complete phrase structure annotation.\ + As the final result of the Burmese component in the Asian Language Treebank Project, this is the first large-scale,\ + open-access treebank for the Burmese language.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_tree", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_tree", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features({"id": datasets.Value("string"), "text": datasets.Value("string")}) + elif self.config.schema == "seacrowd_tree": + features = schemas.tree_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(data_dir, "my-alt-190530/data"), + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + if self.config.schema == "source": + with open(filepath, "r") as f: + for idx, line in enumerate(f): + example = {"id": line.split("\t")[0], "text": line.split("\t")[1]} + yield idx, example + + elif self.config.schema == "seacrowd_tree": + with open(filepath, "r") as f: + for idx, line in enumerate(f): + example = extract_data(line) + yield idx, example diff --git a/seacrowd/sea_datasets/alt_burmese_treebank/utils/__init__.py b/seacrowd/sea_datasets/alt_burmese_treebank/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/alt_burmese_treebank/utils/alt_burmese_treebank_utils.py b/seacrowd/sea_datasets/alt_burmese_treebank/utils/alt_burmese_treebank_utils.py new file mode 100644 index 000000000..3e78e6cc6 --- /dev/null +++ b/seacrowd/sea_datasets/alt_burmese_treebank/utils/alt_burmese_treebank_utils.py @@ -0,0 +1,70 @@ +import re + + +def extract_parts(input_string): + parts = [] + stack = [] + current_part = "" + + for char in input_string: + if char == "(": + stack.append("(") + elif char == ")": + if stack: + stack.pop() + if not stack: + parts.append(current_part[1:].strip()) + current_part = "" + else: + parts.append(current_part[1:].strip()) + current_part = "" + if stack: + current_part += char + + return parts + + +def extract_sentence(input_string): + 
innermost_pattern = re.compile(r"\(([^()]+)\)") + innermost_matches = re.findall(innermost_pattern, input_string) + extracted_sentence = " ".join(match.split()[1] for match in innermost_matches) + if len(extracted_sentence) == 0: + extracted_sentence = " ".join(input_string.split()[1:]) + return extracted_sentence + + +def extract_data(sentence): + nodes = [] + sub_nodes = {} + sub_node_ids = [] + + # Extract id, sub_nodes and text of ROOT + sentence_id = sentence.split("\t")[0] + root_sent = sentence[sentence.find("ROOT") : -1] + root_subnodes = extract_parts(root_sent) + sub_nodes.update({i + 1: root_subnodes[i] for i in range(len(root_subnodes))}) + sub_node_ids.extend([i + 1 for i in range(len(root_subnodes))]) + root_text = extract_sentence(root_sent) + + nodes.append({"id": f"{sentence_id+'.'+str(0)}", "type": "ROOT", "text": root_text, "offsets": [0, len(root_text) - 1], "subnodes": [f"{sentence_id+'.'+str(i)}" for i in sub_node_ids]}) + + while sub_node_ids: + sub_node_id = sub_node_ids.pop(0) + text = extract_sentence(sub_nodes[sub_node_id]) + + cur_subnodes = extract_parts(sub_nodes[sub_node_id]) + + if len(cur_subnodes) > 0: + id_to_add = sub_node_ids[-1] if len(sub_node_ids) > 0 else sub_node_id + cur_subnode_ids = [id_to_add + i + 1 for i in range(len(cur_subnodes))] + sub_nodes.update({id_to_add + i + 1: cur_subnodes[i] for i in range(len(cur_subnodes))}) + sub_node_ids.extend(cur_subnode_ids) + else: + cur_subnode_ids = [] + + node_type = sub_nodes[sub_node_id].split(" ")[0] + start = root_text.find(text) + end = start + len(text) - 1 + + nodes.append({"id": f"{sentence_id+'.'+str(sub_node_id)}", "type": node_type, "text": text, "offsets": [start, end], "subnodes": [f"{sentence_id+'.'+str(i)}" for i in cur_subnode_ids]}) + return {"id": sentence_id, "passage": {"id": sentence_id + "_0", "type": None, "text": [nodes[0]["text"]], "offsets": nodes[0]["offsets"]}, "nodes": nodes} diff --git a/seacrowd/sea_datasets/ara_close/__init__.py b/seacrowd/sea_datasets/ara_close/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/ara_close/ara_close.py b/seacrowd/sea_datasets/ara_close/ara_close.py new file mode 100644 index 000000000..a4b6aaa48 --- /dev/null +++ b/seacrowd/sea_datasets/ara_close/ara_close.py @@ -0,0 +1,194 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" \ +The dataset contribution of this study is a compilation of short fictional stories \ +written in Bikol for readability assessment. The data was combined other collected \ +Philippine language corpora, such as Tagalog and Cebuano. The data from these languages \ +are all distributed across the Philippine elementary system's first three grade \ +levels (L1, L2, L3). We sourced this dataset from Let's Read Asia (LRA), Bloom Library, \ +Department of Education, and Adarna House. 
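+
+A minimal loading sketch (hypothetical usage; it assumes this script is invoked directly through
+`datasets.load_dataset` and that the per-language config names defined below, such as the Bikol text config, are kept as-is):
+
+    from datasets import load_dataset
+
+    bikol = load_dataset("seacrowd/sea_datasets/ara_close/ara_close.py", name="ara_close_bcl_seacrowd_text", split="train")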
+""" + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{imperial-kochmar-2023-automatic, + title = "Automatic Readability Assessment for Closely Related Languages", + author = "Imperial, Joseph Marvin and + Kochmar, Ekaterina", + editor = "Rogers, Anna and + Boyd-Graber, Jordan and + Okazaki, Naoaki", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2023", + month = jul, + year = "2023", + address = "Toronto, Canada", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2023.findings-acl.331", + doi = "10.18653/v1/2023.findings-acl.331", + pages = "5371--5386", + abstract = "In recent years, the main focus of research on automatic readability assessment (ARA) \ + has shifted towards using expensive deep learning-based methods with the primary goal of increasing models{'} accuracy. \ + This, however, is rarely applicable for low-resource languages where traditional handcrafted features are still \ + widely used due to the lack of existing NLP tools to extract deeper linguistic representations. In this work, \ + we take a step back from the technical component and focus on how linguistic aspects such as mutual intelligibility \ + or degree of language relatedness can improve ARA in a low-resource setting. We collect short stories written in three \ + languages in the Philippines{---}Tagalog, Bikol, and Cebuano{---}to train readability assessment models and explore the \ + interaction of data and features in various cross-lingual setups. Our results show that the inclusion of CrossNGO, \ + a novel specialized feature exploiting n-gram overlap applied to languages with high mutual intelligibility, \ + significantly improves the performance of ARA models compared to the use of off-the-shelf large multilingual \ + language models alone. Consequently, when both linguistic representations are combined, we achieve state-of-the-art \ + results for Tagalog and Cebuano, and baseline scores for ARA in Bikol.", +} +""" + +_DATASETNAME = "ara_close" + +_DESCRIPTION = """\ +The dataset contribution of this study is a compilation of short fictional stories \ +written in Bikol for readability assessment. The data was combined other collected \ +Philippine language corpora, such as Tagalog and Cebuano. The data from these languages \ +are all distributed across the Philippine elementary system's first three grade \ +levels (L1, L2, L3). We sourced this dataset from Let's Read Asia (LRA), Bloom Library, \ +Department of Education, and Adarna House. 
\ +""" + +_HOMEPAGE = "https://github.com/imperialite/ara-close-lang" + +_LANGUAGES = ["bcl", "ceb"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.CC_BY_4_0.value # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + "bcl": "https://raw.githubusercontent.com/imperialite/ara-close-lang/main/data/bikol/bik_all_data.txt", + # 'tgl': '', # file for tgl language was deleted + "ceb": "https://raw.githubusercontent.com/imperialite/ara-close-lang/main/data/cebuano/ceb_all_data.txt", +} + +_SUPPORTED_TASKS = [Tasks.READABILITY_ASSESSMENT] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class AraCloseDataset(datasets.GeneratorBasedBuilder): + f"""{_DESCRIPTION}""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [SEACrowdConfig(name=f"{_DATASETNAME}_{lang}_source", version=datasets.Version(_SOURCE_VERSION), description=f"{_DATASETNAME} source schema", schema="source", subset_id=f"{_DATASETNAME}",) for lang in _LANGUAGES] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}_seacrowd_text", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_text", + subset_id=f"{_DATASETNAME}", + ) + for lang in _LANGUAGES + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "title": datasets.Value("string"), + "text": datasets.Value("string"), + "label": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_text": + features = schemas.text_features(["1", "2", "3"]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + lang = self.config.name.split("_")[2] + if lang in _LANGUAGES: + data_path = Path(dl_manager.download_and_extract(_URLS[lang])) + else: + data_path = [Path(dl_manager.download_and_extract(_URLS[lang])) for lang in _LANGUAGES] + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_path, + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + lang = self.config.name.split("_")[2] + if lang in _LANGUAGES: + file_content = open(filepath, "r").readlines() + else: + file_content = [] + for path in filepath: + lines = open(path, "r").readlines() + file_content.extend(lines) + + if self.config.schema == "source": + idx = 0 + for line in file_content: + split_data = line.strip().split(",") + title = split_data[0] + label = split_data[1] + text = ",".join(split_data[2:]) + ex = {"title": title, "text": text, "label": label} + yield idx, ex + idx += 1 + + elif self.config.schema == "seacrowd_text": + idx = 0 + for line in file_content: + split_data = line.strip().split(",") + title = split_data[0] + label = split_data[1] + text = ",".join(split_data[2:]) + ex = { + "id": idx, + "text": text, + "label": label, + } + yield idx, ex + idx += 1 + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/asr_indocsc/__init__.py 
b/seacrowd/sea_datasets/asr_indocsc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/asr_indocsc/asr_indocsc.py b/seacrowd/sea_datasets/asr_indocsc/asr_indocsc.py new file mode 100644 index 000000000..6d2c367d4 --- /dev/null +++ b/seacrowd/sea_datasets/asr_indocsc/asr_indocsc.py @@ -0,0 +1,192 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +# no bibtex citation +_CITATION = "" +_DATASETNAME = "asr_indocsc" +_DESCRIPTION = """\ +This open-source dataset consists of 4.54 hours of transcribed Indonesian +conversational speech on certain topics, where seven conversations between two +pairs of speakers were contained. Please create an account and be logged in on +https://magichub.com to download the data. +""" + +_HOMEPAGE = "https://magichub.com/datasets/indonesian-conversational-speech-corpus/" +_LANGUAGES = ["ind"] +_LICENSE = Licenses.CC_BY_NC_ND_4_0.value +_LOCAL = False +_URLS = { + _DATASETNAME: "https://magichub.com/df/df.php?file_name=Indonesian_Conversational_Speech_Corpus.zip", +} +_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class ASRIndocscDataset(datasets.GeneratorBasedBuilder): + """ASR-Indocsc consists transcribed Indonesian conversational speech on certain topics""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "sptext" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "channel": datasets.Value("string"), + "uttrans_id": datasets.Value("string"), + "speaker_id": datasets.Value("string"), + "topic": datasets.Value("string"), + "text": datasets.Value("string"), + "path": datasets.Value("string"), + "audio": datasets.Audio(sampling_rate=16_000), + "speaker_gender": datasets.Value("string"), + "speaker_age": datasets.Value("int64"), + "speaker_region": datasets.Value("string"), + "speaker_device": datasets.Value("string"), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.speech_text_features + + return 
datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + data_paths = { + _DATASETNAME: Path(dl_manager.download_and_extract(_URLS[_DATASETNAME])), + } + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_paths[_DATASETNAME], + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + # read AUDIOINFO file + # columns: channel, uttrans_id, speaker_id, topic + audioinfo_filepath = os.path.join(filepath, "AUDIOINFO.txt") + with open(audioinfo_filepath, "r", encoding="utf-8") as audioinfo_file: + audioinfo_data = audioinfo_file.readlines() + audioinfo_data = audioinfo_data[1:] # remove header + audioinfo_data = [s.strip("\n").split("\t") for s in audioinfo_data] + + # read SPKINFO file + # columns: channel, speaker_id, gender, age, region, device + spkinfo_filepath = os.path.join(filepath, "SPKINFO.txt") + with open(spkinfo_filepath, "r", encoding="utf-8") as spkinfo_file: + spkinfo_data = spkinfo_file.readlines() + spkinfo_data = spkinfo_data[1:] # remove header + spkinfo_data = [s.strip("\n").split("\t") for s in spkinfo_data] + for i, s in enumerate(spkinfo_data): + if s[2] == "M": + s[2] = "male" + elif s[2] == "F": + s[2] = "female" + else: + s[2] = None + # dictionary of metadata of each speaker + spkinfo_dict = {s[1]: {"speaker_gender": s[2], "speaker_age": int(s[3]), "speaker_region": s[4], "speaker_device": s[5]} for s in spkinfo_data} + + num_sample = len(audioinfo_data) + + for i in range(num_sample): + # wav file + wav_path = os.path.join(filepath, "WAV", audioinfo_data[i][1]) + # transcription file + transcription_path = os.path.join(filepath, "TXT", audioinfo_data[i][1].replace("wav", "txt")) + with open(transcription_path, "r", encoding="utf-8") as transcription_file: + transcription = transcription_file.readlines() + # remove redundant speaker info from transcription file + transcription = [s.strip("\n").split("\t") for s in transcription] + transcription = [s[-1] for s in transcription] + text = " \n ".join(transcription) + + if self.config.schema == "source": + example = { + "id": audioinfo_data[i][1].strip(".wav"), + "channel": audioinfo_data[i][0], + "uttrans_id": audioinfo_data[i][1], + "speaker_id": audioinfo_data[i][2], + "topic": audioinfo_data[i][3], + "text": text, + "path": wav_path, + "audio": wav_path, + "speaker_gender": spkinfo_dict[audioinfo_data[i][2]]["speaker_gender"], + "speaker_age": spkinfo_dict[audioinfo_data[i][2]]["speaker_age"], + "speaker_region": spkinfo_dict[audioinfo_data[i][2]]["speaker_region"], + "speaker_device": spkinfo_dict[audioinfo_data[i][2]]["speaker_device"], + } + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + example = { + "id": audioinfo_data[i][1].strip(".wav"), + "speaker_id": audioinfo_data[i][2], + "path": wav_path, + "audio": wav_path, + "text": text, + "metadata": {"speaker_age": spkinfo_dict[audioinfo_data[i][2]]["speaker_age"], "speaker_gender": spkinfo_dict[audioinfo_data[i][2]]["speaker_gender"]}, + } + + yield i, example diff --git a/seacrowd/sea_datasets/asr_sindodusc/__init__.py b/seacrowd/sea_datasets/asr_sindodusc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/seacrowd/sea_datasets/asr_sindodusc/asr_sindodusc.py b/seacrowd/sea_datasets/asr_sindodusc/asr_sindodusc.py new file mode 100644 index 000000000..a7ed319c0 --- /dev/null +++ b/seacrowd/sea_datasets/asr_sindodusc/asr_sindodusc.py @@ -0,0 +1,180 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +# no bibtex citation +_CITATION = "" +_DATASETNAME = "asr_sindodusc" +_DESCRIPTION = """\ +This open-source dataset consists of 3.5 hours of transcribed Indonesian +scripted speech focusing on daily use sentences, where 3,296 utterances +contributed by ten speakers were contained. +""" + +_HOMEPAGE = "https://magichub.com/datasets/indonesian-scripted-speech-corpus-daily-use-sentence/" +_LANGUAGES = ["ind"] +_LICENSE = Licenses.CC_BY_NC_ND_4_0.value +_LOCAL = False +_URLS = { + _DATASETNAME: "https://magichub.com/df/df.php?file_name=Indonesian_Scripted_Speech_Corpus_Daily_Use_Sentence.zip", +} +_SUPPORTED_TASKS = [Tasks.TEXT_TO_SPEECH, Tasks.SPEECH_RECOGNITION] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class ASRSindodusc(datasets.GeneratorBasedBuilder): + """ASR-Sindodusc consists transcribed Indonesian scripted speech focusing on daily use sentences""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "sptext" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "channel": datasets.Value("string"), + "uttrans_id": datasets.Value("string"), + "speaker_id": datasets.Value("string"), + "transcription": datasets.Value("string"), + "path": datasets.Value("string"), + "audio": datasets.Audio(sampling_rate=16_000), + "speaker_gender": datasets.Value("string"), + "speaker_age": datasets.Value("int64"), + "speaker_region": datasets.Value("string"), + "speaker_device": datasets.Value("string"), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.speech_text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def 
_split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + data_paths = { + _DATASETNAME: Path(dl_manager.download_and_extract(_URLS[_DATASETNAME])), + } + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_paths[_DATASETNAME], + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + # read UTTRANSINFO file + # columns: channel, uttrans_id, speaker_id, prompt (empty field), transcription + uttransinfo_filepath = os.path.join(filepath, "UTTRANSINFO.txt") + with open(uttransinfo_filepath, "r", encoding="utf-8") as uttransinfo_file: + uttransinfo_data = uttransinfo_file.readlines() + uttransinfo_data = uttransinfo_data[1:] # remove header + uttransinfo_data = [s.strip("\n").split("\t") for s in uttransinfo_data] + + # read SPKINFO file + # columns: channel, speaker_id, gender, age, region, device + spkinfo_filepath = os.path.join(filepath, "SPKINFO.txt") + with open(spkinfo_filepath, "r", encoding="utf-8") as spkinfo_file: + spkinfo_data = spkinfo_file.readlines() + spkinfo_data = spkinfo_data[1:] # remove header + spkinfo_data = [s.strip("\n").split("\t") for s in spkinfo_data] + for i, s in enumerate(spkinfo_data): + if s[2] == "M": + s[2] = "male" + elif s[2] == "F": + s[2] = "female" + else: + s[2] = None + # dictionary of metadata of each speaker + spkinfo_dict = {s[1]: {"speaker_gender": s[2], "speaker_age": int(s[3]), "speaker_region": s[4], "speaker_device": s[5]} for s in spkinfo_data} + + num_sample = len(uttransinfo_data) + + for i in range(num_sample): + wav_path = os.path.join(filepath, "WAV", uttransinfo_data[i][2], uttransinfo_data[i][1]) + + if self.config.schema == "source": + example = { + "id": str(i), + "channel": uttransinfo_data[i][0], + "uttrans_id": uttransinfo_data[i][1], + "speaker_id": uttransinfo_data[i][2], + "transcription": uttransinfo_data[i][4], + "path": wav_path, + "audio": wav_path, + "speaker_gender": spkinfo_dict[uttransinfo_data[i][2]]["speaker_gender"], + "speaker_age": spkinfo_dict[uttransinfo_data[i][2]]["speaker_age"], + "speaker_region": spkinfo_dict[uttransinfo_data[i][2]]["speaker_region"], + "speaker_device": spkinfo_dict[uttransinfo_data[i][2]]["speaker_device"], + } + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + example = { + "id": str(i), + "speaker_id": uttransinfo_data[i][2], + "path": wav_path, + "audio": wav_path, + "text": uttransinfo_data[i][4], + "metadata": {"speaker_age": spkinfo_dict[uttransinfo_data[i][2]]["speaker_age"], "speaker_gender": spkinfo_dict[uttransinfo_data[i][2]]["speaker_gender"]}, + } + + yield i, example diff --git a/seacrowd/sea_datasets/asr_smaldusc/__init__.py b/seacrowd/sea_datasets/asr_smaldusc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/asr_smaldusc/asr_smaldusc.py b/seacrowd/sea_datasets/asr_smaldusc/asr_smaldusc.py new file mode 100644 index 000000000..007c296de --- /dev/null +++ b/seacrowd/sea_datasets/asr_smaldusc/asr_smaldusc.py @@ -0,0 +1,182 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +# no bibtex citation +_CITATION = "" +_DATASETNAME = "asr_smaldusc" +_DESCRIPTION = """\ +This open-source dataset consists of 4.8 hours of transcribed Malay scripted +speech focusing on daily use sentences, where 2,839 utterances contributed by +ten speakers were contained. +""" + +_HOMEPAGE = "https://magichub.com/datasets/malay-scripted-speech-corpus-daily-use-sentence/" +_LANGUAGES = ["zlm"] +_LICENSE = Licenses.CC_BY_NC_ND_4_0.value +_LOCAL = False +_URLS = { + _DATASETNAME: "https://magichub.com/df/df.php?file_name=Malay_Scripted_Speech_Corpus_Daily_Use_Sentence.zip", +} +_SUPPORTED_TASKS = [Tasks.TEXT_TO_SPEECH, Tasks.SPEECH_RECOGNITION] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class ASRSmaldusc(datasets.GeneratorBasedBuilder): + """ASR-Smaldusc consists transcribed Malay scripted speech focusing on daily use sentences.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "sptext" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "channel": datasets.Value("string"), + "uttrans_id": datasets.Value("string"), + "speaker_id": datasets.Value("string"), + "prompt": datasets.Value("string"), + "transcription": datasets.Value("string"), + "path": datasets.Value("string"), + "audio": datasets.Audio(sampling_rate=16_000), + "speaker_gender": datasets.Value("string"), + "speaker_age": datasets.Value("int64"), + "speaker_region": datasets.Value("string"), + "speaker_device": datasets.Value("string"), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.speech_text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + data_paths = { + _DATASETNAME: Path(dl_manager.download_and_extract(_URLS[_DATASETNAME])), + } + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_paths[_DATASETNAME], + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as 
(key, example) tuples.""" + + # read UTTRANSINFO file + # columns: channel, uttrans_id, speaker_id, prompt, transcription + uttransinfo_filepath = os.path.join(filepath, "UTTRANSINFO.txt") + with open(uttransinfo_filepath, "r", encoding="utf-8") as uttransinfo_file: + uttransinfo_data = uttransinfo_file.readlines() + uttransinfo_data = uttransinfo_data[1:] # remove header + uttransinfo_data = [s.strip("\n").split("\t") for s in uttransinfo_data] + + # read SPKINFO file + # columns: channel, speaker_id, gender, age, region, device + spkinfo_filepath = os.path.join(filepath, "SPKINFO.txt") + with open(spkinfo_filepath, "r", encoding="utf-8") as spkinfo_file: + spkinfo_data = spkinfo_file.readlines() + spkinfo_data = spkinfo_data[1:] # remove header + spkinfo_data = [s.strip("\n").split("\t") for s in spkinfo_data] + for i, s in enumerate(spkinfo_data): + if s[2] == "M": + s[2] = "male" + elif s[2] == "F": + s[2] = "female" + else: + s[2] = None + # dictionary of metadata of each speaker + spkinfo_dict = {s[1]: {"speaker_gender": s[2], "speaker_age": int(s[3]), "speaker_region": s[4], "speaker_device": s[5]} for s in spkinfo_data} + + num_sample = len(uttransinfo_data) + + for i in range(num_sample): + wav_path = os.path.join(filepath, "WAV", uttransinfo_data[i][2], uttransinfo_data[i][1]) + + if self.config.schema == "source": + example = { + "id": str(i), + "channel": uttransinfo_data[i][0], + "uttrans_id": uttransinfo_data[i][1], + "speaker_id": uttransinfo_data[i][2], + "prompt": uttransinfo_data[i][3], + "transcription": uttransinfo_data[i][4], + "path": wav_path, + "audio": wav_path, + "speaker_gender": spkinfo_dict[uttransinfo_data[i][2]]["speaker_gender"], + "speaker_age": spkinfo_dict[uttransinfo_data[i][2]]["speaker_age"], + "speaker_region": spkinfo_dict[uttransinfo_data[i][2]]["speaker_region"], + "speaker_device": spkinfo_dict[uttransinfo_data[i][2]]["speaker_device"], + } + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + example = { + "id": str(i), + "speaker_id": uttransinfo_data[i][2], + "path": wav_path, + "audio": wav_path, + "text": uttransinfo_data[i][4], + "metadata": {"speaker_age": spkinfo_dict[uttransinfo_data[i][2]]["speaker_age"], "speaker_gender": spkinfo_dict[uttransinfo_data[i][2]]["speaker_gender"]}, + } + + yield i, example diff --git a/seacrowd/sea_datasets/asr_stidusc/__init__.py b/seacrowd/sea_datasets/asr_stidusc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/asr_stidusc/asr_stidusc.py b/seacrowd/sea_datasets/asr_stidusc/asr_stidusc.py new file mode 100644 index 000000000..e03d6b9f6 --- /dev/null +++ b/seacrowd/sea_datasets/asr_stidusc/asr_stidusc.py @@ -0,0 +1,178 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
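+
+# A minimal loading sketch (hypothetical usage; it assumes this script is invoked directly through
+# `datasets.load_dataset` and that the MagicHub archive referenced in _URLS below remains reachable):
+#
+#     from datasets import load_dataset
+#
+#     thai_speech = load_dataset("seacrowd/sea_datasets/asr_stidusc/asr_stidusc.py", name="asr_stidusc_seacrowd_sptext", split="train")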
+ + +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +# no bibtex citation +_CITATION = "" +_DATASETNAME = "asr_stidusc" +_DESCRIPTION = """\ +This open-source dataset consists of 4.56 hours of transcribed Thai scripted +speech focusing on daily use sentences, where 5,431 utterances contributed by +ten speakers were contained. +""" + +_HOMEPAGE = "https://magichub.com/datasets/thai-scripted-speech-corpus-daily-use-sentence/" +_LANGUAGES = ["tha"] +_LICENSE = Licenses.CC_BY_NC_ND_4_0.value +_LOCAL = False +_URLS = { + _DATASETNAME: "https://magichub.com/df/df.php?file_name=Thai_Scripted_Speech_Corpus_Daily_Use_Sentence.zip", +} +_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class ASRSTIDuSCDataset(datasets.GeneratorBasedBuilder): + """ASR-STIDuSC consists transcribed Thai scripted speech focusing on daily use sentences""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "sptext" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "channel": datasets.Value("string"), + "uttrans_id": datasets.Value("string"), + "speaker_id": datasets.Value("string"), + "transcription": datasets.Value("string"), + "path": datasets.Value("string"), + "audio": datasets.Audio(sampling_rate=16_000), + "speaker_gender": datasets.Value("string"), + "speaker_age": datasets.Value("int64"), + "speaker_region": datasets.Value("string"), + "speaker_device": datasets.Value("string"), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.speech_text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + data_paths = { + _DATASETNAME: Path(dl_manager.download_and_extract(_URLS[_DATASETNAME])), + } + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_paths[_DATASETNAME], + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + # read UTTRANSINFO file + # columns: channel, uttrans_id, speaker_id, prompt (empty field), transcription + uttransinfo_filepath = os.path.join(filepath, "UTTRANSINFO.txt") + with open(uttransinfo_filepath, "r", encoding="utf-8") as uttransinfo_file: + uttransinfo_data = uttransinfo_file.readlines() + uttransinfo_data = uttransinfo_data[1:] # remove header + uttransinfo_data = [s.strip("\n").split("\t") for s in uttransinfo_data] + + # read 
SPKINFO file + # columns: channel, speaker_id, gender, age, region, device + spkinfo_filepath = os.path.join(filepath, "SPKINFO.txt") + with open(spkinfo_filepath, "r", encoding="utf-8") as spkinfo_file: + spkinfo_data = spkinfo_file.readlines() + spkinfo_data = spkinfo_data[1:] # remove header + spkinfo_data = [s.strip("\n").split("\t") for s in spkinfo_data] + for i, s in enumerate(spkinfo_data): + if s[2] == "M": + s[2] = "male" + elif s[2] == "F": + s[2] = "female" + else: + s[2] = None + # dictionary of metadata of each speaker + spkinfo_dict = {s[1]: {"speaker_gender": s[2], "speaker_age": int(s[3]), "speaker_region": s[4], "speaker_device": s[5]} for s in spkinfo_data} + + for i, sample in enumerate(uttransinfo_data): + wav_path = os.path.join(filepath, "WAV", sample[2], sample[1]) + + if self.config.schema == "source": + example = { + "id": str(i), + "channel": sample[0], + "uttrans_id": sample[1], + "speaker_id": sample[2], + "transcription": sample[4], + "path": wav_path, + "audio": wav_path, + "speaker_gender": spkinfo_dict[sample[2]]["speaker_gender"], + "speaker_age": spkinfo_dict[sample[2]]["speaker_age"], + "speaker_region": spkinfo_dict[sample[2]]["speaker_region"], + "speaker_device": spkinfo_dict[sample[2]]["speaker_device"], + } + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + example = { + "id": str(i), + "speaker_id": sample[2], + "path": wav_path, + "audio": wav_path, + "text": sample[4], + "metadata": {"speaker_age": spkinfo_dict[sample[2]]["speaker_age"], "speaker_gender": spkinfo_dict[sample[2]]["speaker_gender"]}, + } + + yield i, example diff --git a/seacrowd/sea_datasets/audio_keyword_spotting/__init__.py b/seacrowd/sea_datasets/audio_keyword_spotting/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/audio_keyword_spotting/audio_keyword_spotting.py b/seacrowd/sea_datasets/audio_keyword_spotting/audio_keyword_spotting.py new file mode 100644 index 000000000..6e6413978 --- /dev/null +++ b/seacrowd/sea_datasets/audio_keyword_spotting/audio_keyword_spotting.py @@ -0,0 +1,198 @@ +""" +SEA Crowd Data Loader for Audio Keyword Spotting. 
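+
+A minimal loading sketch (hypothetical usage; it assumes this script is invoked directly through
+`datasets.load_dataset` and that the upstream Hugging Face dataset sil-ai/audio-keyword-spotting stays accessible):
+
+    from datasets import load_dataset
+
+    kws = load_dataset("seacrowd/sea_datasets/audio_keyword_spotting/audio_keyword_spotting.py", name="audio_keyword_spotting_seacrowd_sptext")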
+""" +from typing import Dict, List, Tuple + +import datasets +from datasets.download.download_manager import DownloadManager + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +# since the dataset doesn't have any citation and it was derived using someone else's work, this citation variable will cite source work instead (total of 3, ML Spoken Words 1 and Trabina 2) +_CITATION = r""" +@inproceedings{mazumder2021multilingual, + title={Multilingual Spoken Words Corpus}, + author={Mazumder, Mark and Chitlangia, Sharad and Banbury, Colby and Kang, Yiping and Ciro, Juan Manuel and Achorn, Keith and Galvez, Daniel and Sabini, Mark and Mattson, Peter and Kanter, David and others}, + booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)}, + year={2021} +} +@inproceedings{wu-etal-2018-creating, + title = "Creating a Translation Matrix of the {B}ible{'}s Names Across 591 Languages", + author = "Wu, Winston and + Vyas, Nidhi and + Yarowsky, David", + editor = "Calzolari, Nicoletta and + Choukri, Khalid and + Cieri, Christopher and + Declerck, Thierry and + Goggi, Sara and + Hasida, Koiti and + Isahara, Hitoshi and + Maegaard, Bente and + Mariani, Joseph and + Mazo, H{\'e}l{\`e}ne and + Moreno, Asuncion and + Odijk, Jan and + Piperidis, Stelios and + Tokunaga, Takenobu", + booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)", + month = may, + year = "2018", + address = "Miyazaki, Japan", + publisher = "European Language Resources Association (ELRA)", + url = "https://aclanthology.org/L18-1263", +} +@inproceedings{wu-yarowsky-2018-comparative, + title = "A Comparative Study of Extremely Low-Resource Transliteration of the World{'}s Languages", + author = "Wu, Winston and + Yarowsky, David", + editor = "Calzolari, Nicoletta and + Choukri, Khalid and + Cieri, Christopher and + Declerck, Thierry and + Goggi, Sara and + Hasida, Koiti and + Isahara, Hitoshi and + Maegaard, Bente and + Mariani, Joseph and + Mazo, H{\'e}l{\`e}ne and + Moreno, Asuncion and + Odijk, Jan and + Piperidis, Stelios and + Tokunaga, Takenobu", + booktitle = "Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)", + month = may, + year = "2018", + address = "Miyazaki, Japan", + publisher = "European Language Resources Association (ELRA)", + url = "https://aclanthology.org/L18-1150", +} +""" + +logger = datasets.logging.get_logger(__name__) + +_LOCAL = False +_LANGUAGES = ["ind"] + +_DATASETNAME = "audio_keyword_spotting" +_DESCRIPTION = r"This dataset is a ASR for short text & voices, focusing in identifying common words (or keywords) with entities of Person name and Place Name found in Bible, as found in trabina (https://github.com/wswu/trabina)." + +_HOMEPAGE = "https://huggingface.co/datasets/sil-ai/audio-keyword-spotting" +_LICENSE = Licenses.CC_BY_4_0.value + +_URL = "https://huggingface.co/datasets/sil-ai/audio-keyword-spotting" +_HF_REMOTE_REF = "/".join(_URL.split("/")[-2:]) + +_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION] +_SOURCE_VERSION = "0.0.1" +_SEACROWD_VERSION = "1.0.0" + +CONFIG_SUFFIXES_FOR_TASK = [TASK_TO_SCHEMA.get(task).lower() for task in _SUPPORTED_TASKS] + + +def construct_configs() -> List[SEACrowdConfig]: + """ + The function `construct_configs` constructs a list of SEACrowdConfig objects and returns the config list. 
+ + input: + None + output: + a list of `SEACrowdConfig` objects based on instantiated init variables + """ + + # set output var + config_list = [] + + # construct zipped arg for config instantiation + TASKS_AND_CONFIG_SUFFIX_PAIRS = list(zip(_SUPPORTED_TASKS, CONFIG_SUFFIXES_FOR_TASK)) + + # implement source schema + version, config_name_prefix = _SOURCE_VERSION, "source" + config_list += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{config_name_prefix}", + version=datasets.Version(version), + description=f"{_DATASETNAME} {config_name_prefix} schema", + schema=f"{config_name_prefix}", + subset_id=config_name_prefix, + ) + ] + + # implement SEACrowd schema + version, config_name_prefix = _SEACROWD_VERSION, "seacrowd" + for task_obj, config_name_suffix in TASKS_AND_CONFIG_SUFFIX_PAIRS: + config_list += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{config_name_prefix}_{config_name_suffix}", + version=datasets.Version(version), + description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name}", + schema=f"{config_name_prefix}_{config_name_suffix}", + subset_id=config_name_prefix, + ) + ] + return config_list + + +class AudioKeywordSpottingDataset(datasets.GeneratorBasedBuilder): + """AudioKeywordSpotting dataset, subsetted from https://huggingface.co/datasets/sil-ai/audio-keyword-spotting""" + + # get all schema w/o lang arg + get all schema w/ lang arg + BUILDER_CONFIGS = construct_configs() + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + _config_schema_name = self.config.schema + logger.info(f"Received schema name: {self.config.schema}") + # source schema + if _config_schema_name == "source": + _GENDERS = ["MALE", "FEMALE", "OTHER", "NAN"] + features = datasets.Features( + { + "file": datasets.Value("string"), + "is_valid": datasets.Value("bool"), + "language": datasets.Value("string"), + "speaker_id": datasets.Value("string"), + "gender": datasets.ClassLabel(names=_GENDERS), + "keyword": datasets.Value("string"), + "audio": datasets.Audio(sampling_rate=16_000), + } + ) + + # speech-text schema + elif _config_schema_name == "seacrowd_sptext": + features = schemas.speech_text_features + + else: + raise ValueError(f"Received unexpected config schema of {_config_schema_name}!") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]: + hf_dset_dict = datasets.load_dataset(_HF_REMOTE_REF, "ind") + + return [datasets.SplitGenerator(name=datasets.Split(dset_key), gen_kwargs={"hf_dset": dset}) for dset_key, dset in hf_dset_dict.items() if dset.num_rows > 0] + + def _generate_examples(self, hf_dset) -> Tuple[int, Dict]: + _config_schema_name = self.config.schema + + _idx = 0 + for datapoints in hf_dset: + # since no _idx is available to be used, we're creating it manually for both schema + if _config_schema_name == "source": + yield _idx, {colname: datapoints[colname] for colname in self.info.features} + + elif _config_schema_name == "seacrowd_sptext": + yield _idx, {"id": _idx, "path": datapoints["file"], "audio": datapoints["audio"], "text": datapoints["keyword"], "speaker_id": datapoints["speaker_id"], "metadata": {"speaker_age": None, "speaker_gender": datapoints["gender"]}} + + else: + raise ValueError(f"Received unexpected config schema of {_config_schema_name}!") + + _idx += 1 diff --git a/seacrowd/sea_datasets/aya_dataset/__init__.py 
b/seacrowd/sea_datasets/aya_dataset/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/aya_dataset/aya_dataset.py b/seacrowd/sea_datasets/aya_dataset/aya_dataset.py new file mode 100644 index 000000000..b95d5d896 --- /dev/null +++ b/seacrowd/sea_datasets/aya_dataset/aya_dataset.py @@ -0,0 +1,188 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +The Aya Dataset is a multilingual instruction fine-tuning dataset curated by an open-science community via Aya Annotation Platform from Cohere For AI. The dataset contains a total of 204k human-annotated prompt-completion pairs along with the demographics data of the annotators. This dataset can be used to train, finetune, and evaluate multilingual LLMs. +""" + +from pathlib import Path +from typing import List + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@misc{singh2024aya, + title={Aya Dataset: An Open-Access Collection for Multilingual Instruction Tuning}, + author={Shivalika Singh and Freddie Vargus and Daniel Dsouza and Börje F. Karlsson and Abinaya Mahendiran and Wei-Yin Ko and Herumb Shandilya and Jay Patel and Deividas Mataciunas and Laura OMahony and Mike Zhang and Ramith Hettiarachchi and Joseph Wilson and Marina Machado and Luisa Souza Moura and Dominik Krzemiński and Hakimeh Fadaei and Irem Ergün and Ifeoma Okoh and Aisha Alaagib and Oshan Mudannayake and Zaid Alyafeai and Vu Minh Chien and Sebastian Ruder and Surya Guthikonda and Emad A. Alghamdi and Sebastian Gehrmann and Niklas Muennighoff and Max Bartolo and Julia Kreutzer and Ahmet Üstün and Marzieh Fadaee and Sara Hooker}, + year={2024}, + eprint={2402.06619}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" + +_DATASETNAME = "aya_dataset" + +_DESCRIPTION = """\ +The Aya Dataset is a multilingual instruction fine-tuning dataset curated by an open-science community via Aya Annotation Platform from Cohere For AI. The dataset contains a total of 204k human-annotated prompt-completion pairs along with the demographics data of the annotators. This dataset can be used to train, finetune, and evaluate multilingual LLMs. 
+""" + +_HOMEPAGE = "https://huggingface.co/datasets/CohereForAI/aya_dataset" + +_LANGUAGES = ["ceb", "ind", "jav", "mya", "tam", "tgl", "sun", "tha", "vie", "zsm"] + +_LICENSE = Licenses.APACHE_2_0.value + +_LOCAL = False + +# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) +_URLS = { + "train": "https://huggingface.co/datasets/CohereForAI/aya_dataset/resolve/main/data/train-00000-of-00001.parquet", # test split does not contain SEA languages +} + +_SUPPORTED_TASKS = [Tasks.INSTRUCTION_TUNING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + +_SEACROWD_SCHEMA = "seacrowd_t2t" + + +def _aya_config_constructor(lang: str, schema: str, version: str) -> SEACrowdConfig: + return SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}_{schema}", + version=version, + description=f"Aya Dataset {schema} schema", + schema=schema, + subset_id=f"Aya {lang}", + ) + + +class AyaDataset(datasets.GeneratorBasedBuilder): + """ + The Aya Dataset is a multilingual instruction fine-tuning dataset curated by an open-science community via Aya Annotation Platform from Cohere For AI. The dataset contains a total of 204k human-annotated prompt-completion pairs along with the demographics data of the annotators. This dataset can be used to train, finetune, and evaluate multilingual LLMs. + + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + def _populate_configs(): + configs = [_aya_config_constructor(lang, "source", _SOURCE_VERSION) for lang in _LANGUAGES] + [_aya_config_constructor(lang, _SEACROWD_SCHEMA, _SEACROWD_VERSION) for lang in _LANGUAGES] + + all_lang_source_config = SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=_SOURCE_VERSION, + description="Aya Dataset source schema", + schema="source", + subset_id="Aya", + ) + + all_lang_t2t_config = SEACrowdConfig( + name=f"{_DATASETNAME}_{_SEACROWD_SCHEMA}", + version=_SEACROWD_VERSION, + description=f"Aya Dataset {_SEACROWD_SCHEMA} schema", + schema=_SEACROWD_SCHEMA, + subset_id="Aya", + ) + + configs.append(all_lang_source_config) + configs.append(all_lang_t2t_config) + return configs + + BUILDER_CONFIGS = _populate_configs() + + DEFAULT_CONFIG_NAME = "aya_dataset_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "inputs": datasets.Value("string"), + "targets": datasets.Value("string"), + "language": datasets.Value("string"), + "language_code": datasets.Value("string"), + "annotation_type": datasets.Value("string"), + "user_id": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def get_lang_filter(self, config_name: str): + # aya_dataset_{lang}_{schema} + tokens = config_name.split("_") + if len(tokens) == 0 or len(tokens[2]) != 3: + return None + return tokens[2] + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + url = _URLS["train"] + data_dir = dl_manager.download_and_extract(url) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "data_path": Path(data_dir), + "split": "train", + }, + ), + ] + + def _generate_examples(self, data_path: Path, split: str): + """Yields examples as (key, example) tuples.""" + + 
df = pd.read_parquet(data_path) + + lang_filter = self.get_lang_filter(self.config.name) + if lang_filter is not None: + df = df[df["language_code"] == lang_filter] + else: + df = df[df["language_code"].isin(_LANGUAGES)] + + if self.config.schema == "source": + for idx, row in df.iterrows(): + data = row.to_dict() + yield idx, data + + elif self.config.schema == "seacrowd_t2t": + for idx, row in df.iterrows(): + sample = { + "id": str(idx), + "text_1": row["inputs"], + "text_2": row["targets"], + "text_1_name": "inputs", + "text_2_name": "targets", + } + yield idx, sample diff --git a/seacrowd/sea_datasets/bactrian_x/__init__.py b/seacrowd/sea_datasets/bactrian_x/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/bactrian_x/bactrian_x.py b/seacrowd/sea_datasets/bactrian_x/bactrian_x.py new file mode 100644 index 000000000..da99f28f3 --- /dev/null +++ b/seacrowd/sea_datasets/bactrian_x/bactrian_x.py @@ -0,0 +1,153 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses, TASK_TO_SCHEMA, SCHEMA_TO_FEATURES + +_CITATION = """\ +@misc{li2023bactrianx, + title={Bactrian-X : A Multilingual Replicable Instruction-Following Model with Low-Rank Adaptation}, + author={Haonan Li and Fajri Koto and Minghao Wu and Alham Fikri Aji and Timothy Baldwin}, + year={2023}, + eprint={2305.15011}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" + +_DATASETNAME = "bactrian_x" + +_DESCRIPTION = """\ +The Bactrain-X dataset is a collection of 3.4M instruction-response pairs in 52 +languages, that are obtained by translating 67K English instructions (alpaca-52k ++ dolly-15k) into 51 languages using Google Translate API. The translated +instructions are then fed to ChatGPT (gpt-3.5-turbo) to obtain its natural +responses, resulting in 3.4M instruction-response pairs in 52 languages (52 +languages x 67k instances = 3.4M instances). Human evaluations were conducted to +evaluate response quality for several languages, with those of interest to +SEACrowd being Burmese and Tagalog. 
+""" + +_HOMEPAGE = "https://github.com/mbzuai-nlp/Bactrian-X" + +_LANGUAGES = ["mya", "tgl", "ind", "khm", "tha", "vie"] + +_LICENSE = Licenses.CC_BY_NC_4_0.value + +_LOCAL = False + +_BASE_URL = "https://huggingface.co/datasets/MBZUAI/Bactrian-X/resolve/main/data/{subset}.json.gz?download=true" +_SUBSETS = ["my", "tl", "id", "km", "th", "vi"] + +_SUPPORTED_TASKS = [Tasks.INSTRUCTION_TUNING] +_SEACROWD_SCHEMA = f"seacrowd_{TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower()}" # t2t + +_SOURCE_VERSION = "1.0.1" + +_SEACROWD_VERSION = "1.0.0" + + +class BactrianXDataset(datasets.GeneratorBasedBuilder): + """A collection of translated instruction-response pairs, evaluated with ChatGPT and human.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [] + for subset in _SUBSETS: + BUILDER_CONFIGS += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} {subset} source schema", + schema="source", + subset_id=subset, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_{_SEACROWD_SCHEMA}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} {subset} SEACrowd schema", + schema=_SEACROWD_SCHEMA, + subset_id=subset, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_id_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "instruction": datasets.Value("string"), + "input": datasets.Value("string"), + "id": datasets.Value("string"), + "output": datasets.Value("string"), + } + ) + elif self.config.schema == _SEACROWD_SCHEMA: + features = SCHEMA_TO_FEATURES[ + TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]] + ] # text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + data_url = _BASE_URL.format(subset=self.config.name.split("_")[2]) + data_path = Path(dl_manager.download_and_extract(data_url)) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "data_path": data_path, + }, + ) + ] + + def _generate_examples(self, data_path: Path) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + with open(data_path, "r", encoding="utf-8") as file: + data = json.load(file) + + if self.config.schema == "source": + for idx, example in enumerate(data): + yield idx, { + "instruction": example["instruction"], + "input": example["input"], + "id": example["id"], + "output": example["output"], + } + elif self.config.schema == _SEACROWD_SCHEMA: + for idx, example in enumerate(data): + yield idx, { + "id": example["id"], + "text_1": f"Instruction: {example['instruction']}\nInput: {example['input']}", + "text_2": example["output"], + "text_1_name": "instruction + input", + "text_2_name": "output", + } diff --git a/seacrowd/sea_datasets/balita_nlp/__init__.py b/seacrowd/sea_datasets/balita_nlp/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/balita_nlp/balita_nlp.py b/seacrowd/sea_datasets/balita_nlp/balita_nlp.py new file mode 100644 index 000000000..a7e8797a7 --- /dev/null +++ b/seacrowd/sea_datasets/balita_nlp/balita_nlp.py @@ -0,0 +1,229 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{bunagtransformer, + author={Bunag, Kenrick Lance T and Esquivel, Rosanna A} + title={Transformer-Based Conditional Language Models to Generate Filipino News Articles}, + year = {2023}, + publisher = {IEOM Society International}, + url = {https://ieomsociety.org/proceedings/2023manila/595.pdf}, + booktitle = {Proceedings of the International Conference on Industrial Engineering and Operations Management}, + pages = {2231–2237}, + numpages = {7}, + location = {Manila, Philippines}, +} +""" + +_DATASETNAME = "balita_nlp" + +_DESCRIPTION = """\ +BalitaNLP is a dataset for image-conditional language generation and text-conditional image generation. It consists of 300k Filipino news +articles and images gathered from Filipino news outlets. News articles are categorized into five possible classes: News, Sports, Entertainment, +Crime, and Other. Some articles were removed from the SEACrowd `imtext` schema, as their corresponding image files do not exist: +- `train` split (262480 total articles): from the original 281403 articles, 18923 (~6.72%) had missing images +- `test` split (32821 total articles): from the original 35177 articles, 2356 (~6.70%) had missing images +- `validation` split (32806 total articles): from the original 35175 articles, 2369 (~6.73%) had missing images +""" + +_HOMEPAGE = "https://github.com/KenrickLance/BalitaNLP-Dataset" + +_LANGUAGES = ["fil"] + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + "text": "https://storage.googleapis.com/public-kenricklancebunag/BalitaNLP/2022/BalitaNLP-Dataset.zip", + "images": { + "part1": "https://storage.googleapis.com/public-kenricklancebunag/BalitaNLP/2022/BalitaNLP-images_1.zip", + "part2": "https://storage.googleapis.com/public-kenricklancebunag/BalitaNLP/2022/BalitaNLP-images_2.zip", + "part3": "https://storage.googleapis.com/public-kenricklancebunag/BalitaNLP/2022/BalitaNLP-images_3.zip", + "part4": "https://storage.googleapis.com/public-kenricklancebunag/BalitaNLP/2022/BalitaNLP-images_4.zip", + }, +} + +_SUPPORTED_TASKS = [Tasks.IMAGE_CAPTIONING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class BalitaNLPDataset(datasets.GeneratorBasedBuilder): + """ + BalitaNLP is an image-text dataset from https://github.com/KenrickLance/BalitaNLP-Dataset. 
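+
+    A minimal loading sketch (hypothetical usage; it assumes this script is invoked directly through
+    `datasets.load_dataset` and that the Google Cloud Storage archives listed in _URLS stay reachable):
+
+        from datasets import load_dataset
+
+        balita = load_dataset("seacrowd/sea_datasets/balita_nlp/balita_nlp.py", name="balita_nlp_seacrowd_imtext", split="train")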
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_imtext", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_imtext", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "body": datasets.Sequence(datasets.Value("string")), + "title": datasets.Value("string"), + "website": datasets.Value("string"), + "category": datasets.Value("string"), + "date": datasets.Value("string"), + "author": datasets.Value("string"), + "url": datasets.Value("string"), + "img_url": datasets.Value("string"), + "img_path": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_imtext": + features = schemas.image_text_features() + features["metadata"] = { + "context": datasets.Value("string"), + "author": datasets.Value("string"), + "category": datasets.Value("string"), + "date": datasets.Value("string"), + "img_url": datasets.Value("string"), + "url": datasets.Value("string"), + "website": datasets.Value("string"), + } + else: + raise ValueError(f"Invalid schema: '{self.config.schema}'") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """ + Returns SplitGenerators. + """ + + text_path = dl_manager.download_and_extract(_URLS["text"]) + img_paths = dl_manager.download_and_extract([v for k, v in _URLS["images"].items()]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "text_path": os.path.join(text_path, "train.json"), + "img_paths": img_paths, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "text_path": os.path.join(text_path, "test.json"), + "img_paths": img_paths, + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "text_path": os.path.join(text_path, "validation.json"), + "img_paths": img_paths, + "split": "validation", + }, + ), + ] + + def _generate_examples(self, text_path: Path, img_paths: Path, split: str) -> Tuple[int, Dict]: + """ + Yields examples as (key, example) tuples. 
+ """ + text_data = pd.read_json(text_path) + data = text_data.to_records() + + for idx, row in enumerate(data): + + # Search for path of image file + img_path = "" + for idx_subpath, img_subpath in enumerate(img_paths): + candidate_filepath = os.path.join(img_subpath, "part" + str(idx_subpath + 1), row["img_path"]) + if os.path.isfile(candidate_filepath): + img_path = candidate_filepath + + if self.config.schema == "source": + x = { + "body": row["body"], + "title": row["title"], + "website": row["website"], + "category": row["category"], + "date": row["date"], + "author": row["author"], + "url": row["url"], + "img_url": row["img_url"], + "img_path": img_path, + } + yield idx, x + + elif self.config.schema == "seacrowd_imtext": + + # Remove examples with no existing image path + if img_path == "": + continue + + x = { + "id": idx, + "image_paths": [img_path], + "texts": row["title"], + "metadata": { + "context": row["body"], + "author": row["author"], + "category": row["category"], + "date": row["date"], + "img_url": row["img_url"], + "url": row["url"], + "website": row["website"], + }, + } + yield idx, x + + else: + raise ValueError(f"Invalid schema: '{self.config.schema}'") diff --git a/seacrowd/sea_datasets/beaye_lexicon/__init__.py b/seacrowd/sea_datasets/beaye_lexicon/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/beaye_lexicon/beaye_lexicon.py b/seacrowd/sea_datasets/beaye_lexicon/beaye_lexicon.py new file mode 100644 index 000000000..01695249f --- /dev/null +++ b/seacrowd/sea_datasets/beaye_lexicon/beaye_lexicon.py @@ -0,0 +1,116 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses + +_CITATION = """\ +@misc{beayelexicon2024, + author = {Lopo, Joanito Agili and Moeljadi, David and Cahyawijaya, Samuel and Aji, Alham Fikri and Sommerlot, + Carly J. and Jacob, June}, + title = {Penyusunan Korpus Paralel Bahasa Indonesia–Bahasa Melayu Ambon, Melayu Kupang, Beaye, dan Uab Meto}, + year = {2024}, + howpublished = {Online}, + url = {https://github.com/joanitolopo/makalah-kongresxii}, + note = {Manuscript in preparation}, +} +""" + +_DATASETNAME = "beaye_lexicon" +_DESCRIPTION = """The Beaye Lexicon is a lexicon resource encompassing translations between Indonesian, English, and +Beaye words. Developed through a collaborative effort involving two native Beaye speakers and evaluated by linguistic +experts, this lexicon comprises 984 Beaye vocabularies. 
The creation of the Beaye Lexicon marks the inaugural effort in +documenting the previously unrecorded Beaye language.""" + +_HOMEPAGE = "https://github.com/joanitolopo/bhinneka-korpus/tree/main/lexicon" +_LICENSE = Licenses.APACHE_2_0.value +_URLS = "https://raw.githubusercontent.com/joanitolopo/bhinneka-korpus/main/lexicon" +_SUPPORTED_TASKS = [] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" +_LOCAL = False + +_LANGUAGES = ["ind", "day", "eng"] + +class BeayeLexicon(datasets.GeneratorBasedBuilder): + """Beaye Lexicon is a lexicon resource encompassing translations between Indonesian, English, and Beaye words""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = ( + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"beaye lexicon with source schema for {lang} language", + schema="source", + subset_id="beaye_lexicon", + ) + for lang in _LANGUAGES if lang != "eng" + ] + + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_ext_{lang}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"beaye lexicon with source schema for extensive definiton of beaye language", + schema="source", + subset_id="beaye_lexicon", + ) + for lang in _LANGUAGES if lang != "ind" + ] + ) + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_ind_source" + + def _info(self) -> datasets.DatasetInfo: + schema = self.config.schema + if schema == "source": + features = datasets.Features({"id": datasets.Value("string"), "word": datasets.Value("string")}) + else: + raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + if "ext" in self.config.name.split("_"): + data_dir = Path(dl_manager.download(_URLS + "/english.xlsx")) + else: + data_dir = Path(dl_manager.download(_URLS + "/lexicon.xlsx")) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir, + "split": "train", + } + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + dfs = pd.read_excel(filepath, engine="openpyxl") + if "ext" in self.config.name.split("_"): + lang = self.config.name.split("_")[3] + else: + lang = self.config.name.split("_")[2] + + text = dfs[lang] + + if self.config.schema == "source": + for idx, word in enumerate(text.values): + row = {"id": str(idx), "word": word} + yield idx, row + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/belebele/__init__.py b/seacrowd/sea_datasets/belebele/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/bhinneka_korpus/__init__.py b/seacrowd/sea_datasets/bhinneka_korpus/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/bhinneka_korpus/bhinneka_korpus.py b/seacrowd/sea_datasets/bhinneka_korpus/bhinneka_korpus.py new file mode 100644 index 000000000..d9e255eea --- /dev/null +++ b/seacrowd/sea_datasets/bhinneka_korpus/bhinneka_korpus.py @@ -0,0 +1,139 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils.configs import SEACrowdConfig +from 
seacrowd.utils.constants import Licenses, Tasks
+from seacrowd.utils import schemas
+
+_CITATION = """\
+@misc{lopo2024constructing,
+    title={Constructing and Expanding Low-Resource and Underrepresented Parallel Datasets for Indonesian Local Languages},
+    author={Joanito Agili Lopo and Radius Tanone},
+    year={2024},
+    eprint={2404.01009},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+"""
+
+_DATASETNAME = "bhinneka_korpus"
+_DESCRIPTION = """The Bhinneka Korpus dataset is a parallel dataset for five Indonesian local languages, built through
+a volunteer-driven translation strategy. It covers Indonesian-English sentence pairs as well as lexical terms, and
+contains 16,000 parallel sentences in total, with 4,000 sentence pairs for two of the local languages, approximately
+3,000 sentences for the other languages, and one lexicon dataset for the Beaye language. Since Beaye is an
+undocumented language and no language code has been assigned to it yet, the code "day" (the code for the Land Dayak
+language family) is used to represent it."""
+
+_HOMEPAGE = "https://github.com/joanitolopo/bhinneka-korpus"
+_LICENSE = Licenses.APACHE_2_0.value
+_URLS = "https://raw.githubusercontent.com/joanitolopo/bhinneka-korpus/main/"
+_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]
+_SOURCE_VERSION = "1.0.0"
+_SEACROWD_VERSION = "1.0.0"
+
+_LANGUAGES = ["abs", "aoz", "day", "mak", "mkn"]
+LANGUAGES_TO_FILENAME_MAP = {
+    "abs": "ambonese-malay",
+    "aoz": "uab-meto",
+    "day": "beaye",
+    "mak": "makassarese",
+    "mkn": "kupang-malay",
+}
+
+
+class BhinnekaKorpusDataset(datasets.GeneratorBasedBuilder):
+    """A Collection of Multilingual Parallel Datasets for 5 Indonesian Local Languages."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+    SEACROWD_SCHEMA_NAME = "t2t"
+
+    dataset_names = sorted([f"{_DATASETNAME}_{lang}" for lang in _LANGUAGES])
+    BUILDER_CONFIGS = []
+    for name in dataset_names:
+        source_config = SEACrowdConfig(
+            name=f"{name}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=name
+        )
+        BUILDER_CONFIGS.append(source_config)
+        seacrowd_config = SEACrowdConfig(
+            name=f"{name}_seacrowd_{SEACROWD_SCHEMA_NAME}",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
+            subset_id=name
+        )
+        BUILDER_CONFIGS.append(seacrowd_config)
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_day_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        schema = self.config.schema
+        if schema == "source":
+            features = datasets.Features(
+                {
+                    "source_sentence": datasets.Value("string"),
+                    "target_sentence": datasets.Value("string"),
+                    "source_lang": datasets.Value("string"),
+                    "target_lang": datasets.Value("string")
+                }
+            )
+        elif schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
+            features = schemas.text2text_features
+        else:
+            raise ValueError("Invalid config schema")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        data_dir = []
+        lang = self.config.name.split("_")[2]
+        if lang in _LANGUAGES:
+            data_dir.append(Path(dl_manager.download(_URLS + f"{LANGUAGES_TO_FILENAME_MAP[lang]}/{lang}.xlsx")))
+        else:
+            raise ValueError("Invalid language name")
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": data_dir[0],
+                    "split": "train",
+                    "language": lang
+                }
+            )
+        ]
+
+    def _generate_examples(self, filepath: Path, split: str, language: str) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+        dfs = pd.read_excel(filepath, index_col=0, engine="openpyxl")
+        source_sents = dfs["ind"]
+        target_sents = dfs[language]
+
+        for idx, (source, target) in enumerate(zip(source_sents.values, target_sents.values)):
+            if self.config.schema == "source":
+                example = {
+                    "source_sentence": source,
+                    "target_sentence": target,
+                    "source_lang": "ind",
+                    "target_lang": language
+                }
+            elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
+                example = {
+                    "id": str(idx),
+                    "text_1": source,
+                    "text_2": target,
+                    "text_1_name": "ind",
+                    "text_2_name": language,
+                }
+            yield idx, example
diff --git a/seacrowd/sea_datasets/bioner_id/__init__.py b/seacrowd/sea_datasets/bioner_id/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/bioner_id/bioner_id.py b/seacrowd/sea_datasets/bioner_id/bioner_id.py
new file mode 100644
index 000000000..edd59cf6c
--- /dev/null
+++ b/seacrowd/sea_datasets/bioner_id/bioner_id.py
@@ -0,0 +1,166 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from seacrowd.utils import schemas
+from seacrowd.utils.common_parser import load_conll_data
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """\
+@article{abdillah2023pengenalan,
+    title={Pengenalan Entitas Biomedis dalam Teks Konsultasi Kesehatan Online Berbahasa Indonesia Berbasis Arsitektur Transformers},
+    author={Abdillah, Abid Famasya and Purwitasari, Diana and Juanita, Safitri and Purnomo, Mauridhi Hery},
+    year={2023},
+    month=feb,
+    journal={Jurnal Teknologi Informasi dan Ilmu Komputer},
+    volume={10},
+    number={1},
+    pages={131--140}
+}
+"""
+
+_DATASETNAME = "bioner_id"
+
+_DESCRIPTION = """\
+This dataset was taken from the online health consultation platform Alodokter.com and has been annotated by two medical doctors. Data were annotated using the IOB scheme in CoNLL format.
+
+The dataset contains 2600 medical answers written by doctors from 2017-2020. Two medical experts were assigned to annotate the data into two entity types: DISORDERS and ANATOMY.
+The topics of the answers are diarrhea, HIV-AIDS, nephrolithiasis, and TBC, which are marked as high-risk topics by the WHO.
+"""
+
+_HOMEPAGE = "https://huggingface.co/datasets/abid/indonesia-bioner-dataset"
+
+_LANGUAGES = ["ind"]  # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)
+
+_LICENSE = Licenses.BSD_3_CLAUSE_CLEAR.value
+
+_LOCAL = False
+
+_URLS = {
+    _DATASETNAME: {k: f"https://huggingface.co/datasets/abid/indonesia-bioner-dataset/raw/main/{k}.conll" for k in ["train", "valid", "test"]},
+}
+
+_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION]
+
+_SOURCE_VERSION = "1.0.0"
+
+_SEACROWD_VERSION = "1.0.0"
+
+
+class BioNERIdDataset(datasets.GeneratorBasedBuilder):
+    """2600 conversations between patients and medical doctors from 2017-2020.
+    Two medical experts annotated the data into two entity types: DISORDERS and ANATOMY."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    label_classes = ["B-ANAT", "B-DISO", "I-ANAT", "I-DISO", "O"]
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_seacrowd_seq_label",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema="seacrowd_seq_label",
+            subset_id=f"{_DATASETNAME}",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "sentence": [datasets.Value("string")],
+                    "label": [datasets.Value("string")],
+                }
+            )
+
+        elif self.config.schema == "seacrowd_seq_label":
+            features = schemas.seq_label_features(self.label_classes)
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        urls = _URLS[_DATASETNAME]
+        data_paths = dl_manager.download(urls)
+
+        # def _assert_data(msg):
+        #     cur_data = list(map(
+        #         lambda d: d.split(" "),
+        #         open(fp, "r", encoding="utf8").readlines()
+        #     ))
+        #     assert {1, 4} == set(map(len, cur_data)), msg  # length of 4 is due to uncommon delimiter of " _ _ "
+        #     assert {('_', '_')} == set(map(lambda _: (_[1], _[2]), filter(lambda _: len(_) == 4, cur_data))), msg
+
+        # Convert to tab-separated values
+        for subset in ["train", "valid", "test"]:
+            fp = data_paths[subset]
+            # _assert_data(f"Invalid file for subset '{subset}'")
+            data = open(fp, "r", encoding="utf8").read()
+            # data_paths[subset] = f"{fp}.tsv"
+            open(data_paths[subset], "w", encoding="utf8").write(data.replace(" _ _ ", "\t"))
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": data_paths["train"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"filepath": data_paths["test"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={"filepath": data_paths["valid"]},
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+        data = load_conll_data(filepath)
+
+        if self.config.schema == "source":
+            for key, ex in enumerate(data):
+                yield key, ex
+
+        elif self.config.schema == "seacrowd_seq_label":
+            for key, ex in enumerate(data):
+                yield key, {
+                    "id": str(key),
+                    "tokens": ex["sentence"],
+                    "labels": ex["label"],
+                }
diff --git 
a/seacrowd/sea_datasets/bloom_captioning/__init__.py b/seacrowd/sea_datasets/bloom_captioning/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/bloom_captioning/bloom_captioning.py b/seacrowd/sea_datasets/bloom_captioning/bloom_captioning.py new file mode 100644 index 000000000..4833ec672 --- /dev/null +++ b/seacrowd/sea_datasets/bloom_captioning/bloom_captioning.py @@ -0,0 +1,248 @@ +""" +SEA Crowd Data Loader for Bloom Captioning. +""" +from typing import Dict, List, Tuple + +import datasets +from datasets.download.download_manager import DownloadManager + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +_CITATION = r""" +@inproceedings{leong-etal-2022-bloom, + title = "Bloom Library: Multimodal Datasets in 300+ Languages for a Variety of Downstream Tasks", + author = "Leong, Colin and + Nemecek, Joshua and + Mansdorfer, Jacob and + Filighera, Anna and + Owodunni, Abraham and + Whitenack, Daniel", + editor = "Goldberg, Yoav and + Kozareva, Zornitsa and + Zhang, Yue", + booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing", + month = dec, + year = "2022", + address = "Abu Dhabi, United Arab Emirates", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.emnlp-main.590", + doi = "10.18653/v1/2022.emnlp-main.590", + pages = "8608--8621", +} +""" + +logger = datasets.logging.get_logger(__name__) + +# this config is created for SEACrowd Dataloader +_LANG_CONFIG = { + "abc": "Ambala Ayta", + "ahk": "Akha", + "bfn": "Bunak", + "bjn": "Banjar", + "bkx": "Baikeno", + "brb": "Brao", + "brv": "Western Bru", + "bya": "Batak", + "bzi": "Bisu", + "ceb": "Cebuano", + "cgc": "Kagayanen", + "cmo": "Central Mnong", + "ddg": "Fataluku", + "dmg": "Upper Kinabatangan", + "dnw": "Western Dani", + "dtp": "Kadazan Dusun", + "dtr": "Lotud", + "enc": "En", + "fil": "Filipino", + "gal": "Galolen", + "hil": "Hiligaynon", + "hre": "Hre", + "hro": "Haroi", + "idt": "Idaté", + "ilo": "Ilocano", + "ind": "Indonesian", + "jra": "Jarai", + "kak": "Kalanguya", + "khb": "Lü", + "khm": "Khmer", + "kqr": "Kimaragang", + "krr": "Krung", + "ksw": "S’gaw Karen", + "lhu": "Lahu", + "llg": "Lole", + "lsi": "Lacid", + "lwl": "Eastern Lawa", + "mdr": "Mandar", + "mgm": "Mambae", + "mhx": "Lhao Vo", + "mkz": "Makasae", + "mnw": "Mon", + "mqj": "Mamasa", + "mry": "Mandaya", + "msb": "Masbatenyo", + "mya": "Burmese", + "nod": "Northern Thai", + "nst": "Tangshang Naga", + "nxa": "Nauete", + "nxl": "South Nuaulu", + "pag": "Pangasinan", + "pce": "Ruching Palaung", + "pdu": "Kayan", + "pea": "Peranakan Indonesian", + "pmf": "Pamona", + "sea": "Semai", + "sgd": "Surigaonon", + "shn": "Shan", + "sml": "Central Sama", + "snl": "Sangil", + "tdt": "Tetun Dili", + "tet": "Tetun", + "tha": "Thai", + "tkd": "Tukudede", + "tnt": "Tontemboan", + "tom": "Tombulu", + "tpu": "Tampuan", + "vie": "Vietnamese", + "war": "Waray-Waray", + "wms": "Wambon", + "wnk": "Wanukaka", + "xmm": "Manado Malay", + "yet": "Yetfa", + "zlm": "Malay", +} + +_LOCAL = False +_LANGUAGES = list(_LANG_CONFIG.keys()) + + +_DATASETNAME = "bloom_captioning" +_DESCRIPTION = r""" +This is a Bloom Library dataset developed for the image captioning task. +It covers 74 languages indigenous to SEA overall, amounting to total data of 21K. +This dataset belongs to a CC license, where its datapoints has specific license attached to it. 
+Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/sil-ai/bloom-captioning and use huggingface-cli login for authentication. +""" + +_HOMEPAGE = "https://huggingface.co/datasets/sil-ai/bloom-captioning" +_LICENSE = Licenses.CC.value + +_URL = "https://huggingface.co/datasets/sil-ai/bloom-captioning" +_HF_REMOTE_REF = "/".join(_URL.split("/")[-2:]) + +_SUPPORTED_TASKS = [Tasks.IMAGE_CAPTIONING] +_SOURCE_VERSION = "0.1.0" +_SEACROWD_VERSION = "1.0.0" + +CONFIG_SUFFIXES_FOR_TASK = [TASK_TO_SCHEMA.get(task).lower() for task in _SUPPORTED_TASKS] + + +def construct_configs_on_langs(languages: list = None) -> List[SEACrowdConfig]: + """ + The function `construct_configs` constructs a list of SEACrowdConfig objects based on the provided + languages or a default language, and returns the list. + + input: + languages (list, default None): The `languages` parameter is a list that specifies the languages for which the + configurations need to be constructed. If no languages are provided (value=None), the first value in language config + will be used. + output: + a list of `SEACrowdConfig` objects based on instantiated init variables + """ + + # set output var + config_list = [] + + # construct zipped arg for config instantiation + TASKS_AND_CONFIG_SUFFIX_PAIRS = list(zip(_SUPPORTED_TASKS, CONFIG_SUFFIXES_FOR_TASK)) + + # implement source schema + version, config_name_prefix = _SOURCE_VERSION, "source" + config_list += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}", + version=datasets.Version(version), + description=f"{_DATASETNAME} {config_name_prefix} schema for language code {_LANG}", + schema=f"{config_name_prefix}", + subset_id=_LANG, + ) + for _LANG in languages + ] + + # implement SEACrowd schema + version, config_name_prefix = _SEACROWD_VERSION, "seacrowd" + for task_obj, config_name_suffix in TASKS_AND_CONFIG_SUFFIX_PAIRS: + config_list += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}_{config_name_suffix}", + version=datasets.Version(version), + description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name} and language code {_LANG}", + schema=f"{config_name_prefix}_{config_name_suffix}", + subset_id=_LANG, + ) + for _LANG in languages + ] + return config_list + + +class BloomCaptioningDataset(datasets.GeneratorBasedBuilder): + """Bloom Captioning dataset, subsetted from https://huggingface.co/datasets/sil-ai/bloom-captioning""" + + # get all schema w/o lang arg + get all schema w/ lang arg + BUILDER_CONFIGS = construct_configs_on_langs(_LANGUAGES) + + def _info(self) -> datasets.DatasetInfo: + _config_schema_name = self.config.schema + logger.info(f"Received schema name: {self.config.schema}") + # source schema + if _config_schema_name == "source": + features = datasets.Features( + { + "image_id": datasets.Value("string"), + "image_url": datasets.Value("string"), + "caption": datasets.Value("string"), + "story_id": datasets.Value("string"), + "album_id": datasets.Value("string"), + "license": datasets.Value("string"), + "original_bloom_language_tag": datasets.Value("string"), + "index_in_story": datasets.Value("uint16"), + } + ) + + # image-text schema + elif _config_schema_name == "seacrowd_imtext": + features = schemas.image_text_features() + + else: + raise ValueError(f"Received unexpected config schema of {_config_schema_name}!") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, 
+ ) + + def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]: + hf_dset_dict = datasets.load_dataset(_HF_REMOTE_REF, self.config.subset_id) + + return [datasets.SplitGenerator(name=datasets.Split(dset_key), gen_kwargs={"hf_dset": dset}) for dset_key, dset in hf_dset_dict.items() if dset.num_rows > 0] + + def _generate_examples(self, hf_dset) -> Tuple[int, Dict]: + _config_schema_name = self.config.schema + + _idx = 0 + for datapoints in hf_dset: + # the `_idx` will be generated manually since no `id` present in the dataset fulfill the purpose as primary key + if _config_schema_name == "source": + yield _idx, {colname: datapoints[colname] for colname in self.info.features} + + elif _config_schema_name == "seacrowd_imtext": + yield _idx, {"id": _idx, "image_paths": [datapoints["image_url"]], "texts": datapoints["caption"], "metadata": {"context": "", "labels": []}} + + else: + raise ValueError(f"Received unexpected config schema of {_config_schema_name}!") + + _idx += 1 diff --git a/seacrowd/sea_datasets/bloom_lm/__init__.py b/seacrowd/sea_datasets/bloom_lm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/bloom_lm/bloom_lm.py b/seacrowd/sea_datasets/bloom_lm/bloom_lm.py new file mode 100644 index 000000000..0a7a59bce --- /dev/null +++ b/seacrowd/sea_datasets/bloom_lm/bloom_lm.py @@ -0,0 +1,247 @@ +""" +SEA Crowd Data Loader for Bloom LM. +""" +from typing import Dict, Iterator, List, Tuple + +import datasets +from datasets.download.download_manager import DownloadManager + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +_CITATION = r""" +@inproceedings{leong-etal-2022-bloom, + title = "Bloom Library: Multimodal Datasets in 300+ Languages for a Variety of Downstream Tasks", + author = "Leong, Colin and + Nemecek, Joshua and + Mansdorfer, Jacob and + Filighera, Anna and + Owodunni, Abraham and + Whitenack, Daniel", + editor = "Goldberg, Yoav and + Kozareva, Zornitsa and + Zhang, Yue", + booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing", + month = dec, + year = "2022", + address = "Abu Dhabi, United Arab Emirates", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.emnlp-main.590", + doi = "10.18653/v1/2022.emnlp-main.590", + pages = "8608--8621", +} +""" + +logger = datasets.logging.get_logger(__name__) + +# this config is created for SEACrowd Dataloader +_LANG_CONFIG = { + "abc": "Ambala Ayta", + "ahk": "Akha", + "bfn": "Bunak", + "bjn": "Banjar", + "bkx": "Baikeno", + "brb": "Brao", + "brv": "Western Bru", + "bya": "Batak", + "bzi": "Bisu", + "ceb": "Cebuano", + "cgc": "Kagayanen", + "cmo": "Central Mnong", + "ddg": "Fataluku", + "dmg": "Upper Kinabatangan", + "dnw": "Western Dani", + "dtp": "Kadazan Dusun", + "dtr": "Lotud", + "enc": "En", + "fil": "Filipino", + "gal": "Galolen", + "hil": "Hiligaynon", + "hre": "Hre", + "hro": "Haroi", + "idt": "Idaté", + "ilo": "Ilocano", + "ind": "Indonesian", + "jra": "Jarai", + "kak": "Kalanguya", + "khb": "Lü", + "khm": "Khmer", + "kqr": "Kimaragang", + "krr": "Krung", + "ksw": "S’gaw Karen", + "kvt": "Lahta", + "lao": "Lao", + "lhu": "Lahu", + "llg": "Lole", + "lsi": "Lacid", + "lwl": "Eastern Lawa", + "mdr": "Mandar", + "mgm": "Mambae", + "mhx": "Lhao Vo", + "mkz": "Makasae", + "mnw": "Mon", + "mqj": "Mamasa", + "mry": "Mandaya", + "msb": "Masbatenyo", + "mya": 
"Burmese", + "nod": "Northern Thai", + "nst": "Tangshang Naga", + "nxa": "Nauete", + "nxl": "South Nuaulu", + "pag": "Pangasinan", + "pce": "Ruching Palaung", + "pdu": "Kayan", + "pea": "Peranakan Indonesian", + "pmf": "Pamona", + "psp_ceb": "Filipino Sign Language", + "sea": "Semai", + "sgd": "Surigaonon", + "shn": "Shan", + "sml": "Central Sama", + "snl": "Sangil", + "tdt": "Tetun Dili", + "tet": "Tetun", + "tha": "Thai", + "tkd": "Tukudede", + "tnt": "Tontemboan", + "tom": "Tombulu", + "tpu": "Tampuan", + "vie": "Vietnamese", + "war": "Waray-Waray", + "wms": "Wambon", + "wnk": "Wanukaka", + "xmm": "Manado Malay", + "yet": "Yetfa", + "yin": "Riang Lai", + "zlm": "Malay", +} + +_LOCAL = False +_LANGUAGES = list(_LANG_CONFIG.keys()) + +_DATASETNAME = "bloom_lm" +_DESCRIPTION = r""" +This is a Bloom Library dataset developed for the self-supervised language modeling task. +It covers 74 languages indigenous to SEA overall, amounting to total data of 21K. +This dataset belongs to a CC license, where its datapoints has specific license attached to it. +Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/sil-ai/bloom-lm and use huggingface-cli login for authentication. +""" + +_HOMEPAGE = "https://huggingface.co/datasets/sil-ai/bloom-lm" +_LICENSE = Licenses.CC.value + +_URL = "https://huggingface.co/datasets/sil-ai/bloom-lm" +_HF_REMOTE_REF = "/".join(_URL.split("/")[-2:]) + +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] +_SOURCE_VERSION = "0.1.0" +_SEACROWD_VERSION = "1.0.0" + +CONFIG_SUFFIXES_FOR_TASK = [TASK_TO_SCHEMA.get(task).lower() for task in _SUPPORTED_TASKS] + + +def construct_configs_on_langs() -> List[SEACrowdConfig]: + """ + The function `construct_configs` constructs a list of SEACrowdConfig objects based on `_LANGUAGES` var, and returns the list. 
+ + output: + a list of `SEACrowdConfig` objects based on instantiated init variables + """ + + # set output var + config_list = [] + + # construct zipped arg for config instantiation + TASKS_AND_CONFIG_SUFFIX_PAIRS = list(zip(_SUPPORTED_TASKS, CONFIG_SUFFIXES_FOR_TASK)) + + # implement source schema + version, config_name_prefix = _SOURCE_VERSION, "source" + config_list += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}", + version=datasets.Version(version), + description=f"{_DATASETNAME} {config_name_prefix} schema for language code {_LANG}", + schema=f"{config_name_prefix}", + # since the actual subset_id in source for "psp_ceb" is "psp", we are defining the subset_id as following for loading to source HF + subset_id=_LANG if _LANG != "psp_ceb" else "psp", + ) + for _LANG in _LANGUAGES + ] + + # implement SEACrowd schema + version, config_name_prefix = _SEACROWD_VERSION, "seacrowd" + for task_obj, config_name_suffix in TASKS_AND_CONFIG_SUFFIX_PAIRS: + config_list += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}_{config_name_suffix}", + version=datasets.Version(version), + description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name} and language code {_LANG}", + schema=f"{config_name_prefix}_{config_name_suffix}", + # since the actual subset_id in source for "psp_ceb" is "psp", we are defining the subset_id as following for loading to source HF + subset_id=_LANG if _LANG != "psp_ceb" else "psp", + ) + for _LANG in _LANGUAGES + ] + return config_list + + +class BloomLMDataset(datasets.GeneratorBasedBuilder): + """Bloom LM dataset, subsetted from https://huggingface.co/datasets/sil-ai/bloom-lm""" + + # get all schema w/o lang arg + get all schema w/ lang arg + BUILDER_CONFIGS = construct_configs_on_langs() + + def _info(self) -> datasets.DatasetInfo: + _config_schema_name = self.config.schema + logger.info(f"Received schema name: {self.config.schema}") + # source schema + if _config_schema_name == "source": + features = datasets.Features( + { + "text": datasets.Value("string"), + "title": datasets.Value("string"), + "license": datasets.Value("string"), + "copyright": datasets.Value("string"), + "pageCount": datasets.Value("int32"), + "bookInstanceId": datasets.Value("string"), + "bookLineage": datasets.Value("string"), + } + ) + + # ssp schema + elif _config_schema_name == "seacrowd_ssp": + features = schemas.ssp_features + + else: + raise ValueError(f"Received unexpected config schema of {_config_schema_name}!") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]: + hf_dset_dict = datasets.load_dataset(_HF_REMOTE_REF, self.config.subset_id) + + return [datasets.SplitGenerator(name=datasets.Split(dset_key), gen_kwargs={"hf_dset": dset}) for dset_key, dset in hf_dset_dict.items() if dset.num_rows > 0] + + def _generate_examples(self, hf_dset) -> Iterator[Tuple[int, Dict]]: + _config_schema_name = self.config.schema + + _idx = 0 + for datapoints in hf_dset: + # the `_idx` will be generated manually since no `id` present in the dataset fulfill the purpose as primary key + if _config_schema_name == "source": + yield _idx, {colname: datapoints[colname] for colname in self.info.features} + + elif _config_schema_name == "seacrowd_ssp": + yield _idx, {"id": _idx, "text": datapoints["text"]} + + else: + raise ValueError(f"Received unexpected 
config schema of {_config_schema_name}!") + + _idx += 1 diff --git a/seacrowd/sea_datasets/bloom_speech/__init__.py b/seacrowd/sea_datasets/bloom_speech/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/bloom_speech/bloom_speech.py b/seacrowd/sea_datasets/bloom_speech/bloom_speech.py new file mode 100644 index 000000000..873cc577b --- /dev/null +++ b/seacrowd/sea_datasets/bloom_speech/bloom_speech.py @@ -0,0 +1,172 @@ +""" +SEA Crowd Data Loader for Bloom Speech. +""" +from typing import Dict, List, Tuple + +import datasets +from datasets.download.download_manager import DownloadManager + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +_CITATION = r""" +@inproceedings{leong-etal-2022-bloom, + title = "Bloom Library: Multimodal Datasets in 300+ Languages for a Variety of Downstream Tasks", + author = "Leong, Colin and + Nemecek, Joshua and + Mansdorfer, Jacob and + Filighera, Anna and + Owodunni, Abraham and + Whitenack, Daniel", + editor = "Goldberg, Yoav and + Kozareva, Zornitsa and + Zhang, Yue", + booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing", + month = dec, + year = "2022", + address = "Abu Dhabi, United Arab Emirates", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.emnlp-main.590", + doi = "10.18653/v1/2022.emnlp-main.590", + pages = "8608--8621", +} +""" + +logger = datasets.logging.get_logger(__name__) + +# this config is created for SEACrowd Dataloader +_LANG_CONFIG = {"bjn": "Banjar", "bzi": "Bisu", "ceb": "Cebuano", "ind": "Indonesian", "jra": "Jarai", "kqr": "Kimaragang", "mya": "Burmese", "tgl": "Tagalog"} + +_LOCAL = False +_LANGUAGES = list(_LANG_CONFIG.keys()) + + +_DATASETNAME = "bloom_speech" +_DESCRIPTION = r""" +This version of the Bloom Library data is developed specifically for the automatic speech recognition and speech-to-text tasks. +It includes data from 56 languages across 18 language families. 8 languages are spoken in Southeast Asia. +Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/sil-ai/bloom-speech and use huggingface-cli login for authentication. +""" + +_HOMEPAGE = "https://huggingface.co/datasets/sil-ai/bloom-speech" +_LICENSE = Licenses.CC.value + +_URL = "https://huggingface.co/datasets/sil-ai/bloom-speech" +_HF_REMOTE_REF = "/".join(_URL.split("/")[-2:]) + +_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION] +_SOURCE_VERSION = "0.0.1" +_SEACROWD_VERSION = "1.0.0" + +CONFIG_SUFFIXES_FOR_TASK = [TASK_TO_SCHEMA.get(task).lower() for task in _SUPPORTED_TASKS] + + +def construct_configs_on_langs(languages: list = None) -> List[SEACrowdConfig]: + """ + The function `construct_configs` constructs a list of SEACrowdConfig objects based on the provided + languages or a default language, and returns the list. + + input: + languages (list, default None): The `languages` parameter is a list that specifies the languages for which the + configurations need to be constructed. If no languages are provided (value=None), the first value in language config + will be used. 
+ output: + a list of `SEACrowdConfig` objects based on instantiated init variables + """ + + # set output var + config_list = [] + + # construct zipped arg for config instantiation + TASKS_AND_CONFIG_SUFFIX_PAIRS = list(zip(_SUPPORTED_TASKS, CONFIG_SUFFIXES_FOR_TASK)) + + # implement source schema + version, config_name_prefix = _SOURCE_VERSION, "source" + config_list += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}", + version=datasets.Version(version), + description=f"{_DATASETNAME} {config_name_prefix} schema for language code {_LANG}", + schema=f"{config_name_prefix}", + subset_id=_LANG, + ) + for _LANG in languages + ] + + # implement SEACrowd schema + version, config_name_prefix = _SEACROWD_VERSION, "seacrowd" + for task_obj, config_name_suffix in TASKS_AND_CONFIG_SUFFIX_PAIRS: + config_list += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}_{config_name_suffix}", + version=datasets.Version(version), + description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name} and language code {_LANG}", + schema=f"{config_name_prefix}_{config_name_suffix}", + subset_id=_LANG, + ) + for _LANG in languages + ] + return config_list + + +class BloomSpeechDataset(datasets.GeneratorBasedBuilder): + """Bloom Speech dataset, subsetted from https://huggingface.co/datasets/sil-ai/bloom-speech""" + + # get all schema w/o lang arg + get all schema w/ lang arg + BUILDER_CONFIGS = construct_configs_on_langs(_LANGUAGES) + + def _info(self) -> datasets.DatasetInfo: + _config_schema_name = self.config.schema + logger.info(f"Received schema name: {self.config.schema}") + # source schema + if _config_schema_name == "source": + features = datasets.Features( + { + "file": datasets.Value("string"), + "audio": datasets.Audio(sampling_rate=16_000), + "text": datasets.Value("string"), + "book": datasets.Value("string"), + "instance": datasets.Value("string"), + "license": datasets.Value("string"), + "credits": datasets.Value("string"), + "original_lang_tag": datasets.Value("string"), + } + ) + + # speech-text schema + elif _config_schema_name == "seacrowd_sptext": + features = schemas.speech_text_features + + else: + raise ValueError(f"Received unexpected config schema of {_config_schema_name}!") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]: + hf_dset_dict = datasets.load_dataset(_HF_REMOTE_REF, self.config.subset_id) + + return [datasets.SplitGenerator(name=datasets.Split(dset_key), gen_kwargs={"hf_dset": dset}) for dset_key, dset in hf_dset_dict.items() if dset.num_rows > 0] + + def _generate_examples(self, hf_dset) -> Tuple[int, Dict]: + _config_schema_name = self.config.schema + + _idx = 0 + for datapoints in hf_dset: + # since no _idx is available to be used, we're creating it manually for both schema + if _config_schema_name == "source": + yield _idx, {colname: datapoints[colname] for colname in self.info.features} + + elif _config_schema_name == "seacrowd_sptext": + yield _idx, {"id": _idx, "path": datapoints["file"], "audio": datapoints["audio"], "text": datapoints["text"], "speaker_id": None, "metadata": {"speaker_age": None, "speaker_gender": None}} + + else: + raise ValueError(f"Received unexpected config schema of {_config_schema_name}!") + + _idx += 1 diff --git a/seacrowd/sea_datasets/bloom_vist/__init__.py 
b/seacrowd/sea_datasets/bloom_vist/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/bloom_vist/bloom_vist.py b/seacrowd/sea_datasets/bloom_vist/bloom_vist.py new file mode 100644 index 000000000..959097ff3 --- /dev/null +++ b/seacrowd/sea_datasets/bloom_vist/bloom_vist.py @@ -0,0 +1,262 @@ +""" +SEA Crowd Data Loader for Bloom VIST. +""" +from typing import Dict, List, Tuple + +import datasets +from datasets.download.download_manager import DownloadManager + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +_CITATION = r""" +@inproceedings{leong-etal-2022-bloom, + title = "Bloom Library: Multimodal Datasets in 300+ Languages for a Variety of Downstream Tasks", + author = "Leong, Colin and + Nemecek, Joshua and + Mansdorfer, Jacob and + Filighera, Anna and + Owodunni, Abraham and + Whitenack, Daniel", + editor = "Goldberg, Yoav and + Kozareva, Zornitsa and + Zhang, Yue", + booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing", + month = dec, + year = "2022", + address = "Abu Dhabi, United Arab Emirates", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.emnlp-main.590", + doi = "10.18653/v1/2022.emnlp-main.590", + pages = "8608--8621", +} +""" + +logger = datasets.logging.get_logger(__name__) + +# this config is created for SEACrowd Dataloader +_LANG_CONFIG = { + "abc": "Ambala Ayta", + "ahk": "Akha", + "bfn": "Bunak", + "bjn": "Banjar", + "bkx": "Baikeno", + "brb": "Brao", + "brv": "Western Bru", + "bya": "Batak", + "bzi": "Bisu", + "ceb": "Cebuano", + "cgc": "Kagayanen", + "cmo": "Central Mnong", + "ddg": "Fataluku", + "dmg": "Upper Kinabatangan", + "dnw": "Western Dani", + "dtp": "Kadazan Dusun", + "enc": "En", + "fil": "Filipino", + "hil": "Hiligaynon", + "hro": "Haroi", + "idt": "Idaté", + "ilo": "Ilocano", + "ind": "Indonesian", + "jra": "Jarai", + "kak": "Kalanguya", + "khb": "Lü", + "khm": "Khmer", + "kqr": "Kimaragang", + "krr": "Krung", + "ksw": "S’gaw Karen", + "lhu": "Lahu", + "lsi": "Lacid", + "lwl": "Eastern Lawa", + "mdr": "Mandar", + "mgm": "Mambae", + "mhx": "Lhao Vo", + "mkz": "Makasae", + "mry": "Mandaya", + "msb": "Masbatenyo", + "mya": "Burmese", + "nod": "Northern Thai", + "nxa": "Nauete", + "nxl": "South Nuaulu", + "pag": "Pangasinan", + "pce": "Ruching Palaung", + "pea": "Peranakan Indonesian", + "pmf": "Pamona", + "psp": "Filipino Sign Language", + "sea": "Semai", + "sgd": "Surigaonon", + "sml": "Central Sama", + "snl": "Sangil", + "tdt": "Tetun Dili", + "tet": "Tetun", + "tha": "Thai", + "tkd": "Tukudede", + "tpu": "Tampuan", + "war": "Waray-Waray", + "wms": "Wambon", + "yet": "Yetfa", + "yin": "Riang Lai", + "zlm": "Malay", +} + +_LOCAL = False +_LANGUAGES = list(_LANG_CONFIG.keys()) + + +_DATASETNAME = "bloom_vist" +_DESCRIPTION = r""" +BLOOM VIST is a visual storytelling of books that consists of 62 languages indigenous to SEA. +This dataset is owned by Bloom, a free, open-source software developed by SIL International and associated with Bloom Library, app, and services. +This dataset is released with the LICENSE family of Creative Commons (although each story datapoints has its licensing in more detail, +e.g cc-by, cc-by-nc, cc-by-nd, cc-by-sa, cc-by-nc-nd, cc-by-nc-sa). 
+Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/sil-ai/bloom-vist and use huggingface-cli login for authentication. +""" + +_HOMEPAGE = "https://huggingface.co/datasets/sil-ai/bloom-vist" +_LICENSE = Licenses.CC.value + +_URL = "https://huggingface.co/datasets/sil-ai/bloom-vist" +_HF_REMOTE_REF = "/".join(_URL.split("/")[-2:]) + +_SUPPORTED_TASKS = [Tasks.IMAGE_CAPTIONING] +_SOURCE_VERSION = "0.1.0" +_SEACROWD_VERSION = "1.0.0" + +CONFIG_SUFFIXES_FOR_TASK = [TASK_TO_SCHEMA.get(task).lower() for task in _SUPPORTED_TASKS] + + +def conform_init_config(): + """Assertion Function for Instantiated Configs""" + if len(_LANGUAGES) == 0: + raise AssertionError("No Languages detected from config!") + if len(CONFIG_SUFFIXES_FOR_TASK) != len(_SUPPORTED_TASKS): + raise AssertionError("Config prefixes don't matched in terms of `len` with `_SUPPORTED_TASKS`!") + if len(CONFIG_SUFFIXES_FOR_TASK) == 0: + raise AssertionError("Config prefixes and `_SUPPORTED_TASKS` have `len` of 0!") + + +conform_init_config() + + +def construct_configs_on_langs(languages: list = None) -> List[SEACrowdConfig]: + """ + The function `construct_configs` constructs a list of SEACrowdConfig objects based on the provided + languages or a default language, and returns the list. + + input: + languages (list, default None): The `languages` parameter is a list that specifies the languages for which the + configurations need to be constructed. If no languages are provided (value=None), the first value in language config + will be used. + output: + a list of `SEACrowdConfig` objects based on instantiated init variables + """ + + # set output var + config_list = [] + + # construct zipped arg for config instantiation + TASKS_AND_CONFIG_SUFFIX_PAIRS = list(zip(_SUPPORTED_TASKS, CONFIG_SUFFIXES_FOR_TASK)) + + # implement source schema + version, config_name_prefix = _SOURCE_VERSION, "source" + config_list += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}", + version=datasets.Version(version), + description=f"{_DATASETNAME} {config_name_prefix} schema for language code {_LANG}", + schema=f"{config_name_prefix}", + subset_id=_LANG, + ) + for _LANG in languages + ] + + # implement SEACrowd schema + version, config_name_prefix = _SEACROWD_VERSION, "seacrowd" + for task_obj, config_name_suffix in TASKS_AND_CONFIG_SUFFIX_PAIRS: + config_list += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}_{config_name_suffix}", + version=datasets.Version(version), + description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name} and language code {_LANG}", + schema=f"{config_name_prefix}_{config_name_suffix}", + subset_id=_LANG, + ) + for _LANG in languages + ] + return config_list + + +class BloomVISTDataset(datasets.GeneratorBasedBuilder): + """Bloom VIST dataset, subsetted from https://huggingface.co/datasets/sil-ai/bloom-vist""" + + # get all schema w/o lang arg + get all schema w/ lang arg + BUILDER_CONFIGS = construct_configs_on_langs(_LANGUAGES) + + def _info(self) -> datasets.DatasetInfo: + _config_schema_name = self.config.schema + logger.info(f"Received schema name: {self.config.schema}") + # source schema + if _config_schema_name == "source": + features = datasets.Features( + { + "title": datasets.Value("string"), + "license": datasets.Value("string"), + "album_id": datasets.Value("string"), + "story": datasets.Sequence( + feature={"image_id": datasets.Value("string"), "image_url": datasets.Value("string"), "story_index": 
datasets.Value("int32"), "story_id": datasets.Value("string"), "text": datasets.Value("string")}, length=-1, id=None + ), + } + ) + + # image-text schema + elif _config_schema_name == "seacrowd_imtext": + features = schemas.image_text_features() + + else: + raise ValueError(f"Received unexpected config schema of {_config_schema_name}!") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]: + hf_dset_dict = datasets.load_dataset(_HF_REMOTE_REF, self.config.subset_id) + + return [datasets.SplitGenerator(name=datasets.Split(dset_key), gen_kwargs={"hf_dset": dset}) for dset_key, dset in hf_dset_dict.items() if dset.num_rows > 0] + + def _generate_examples(self, hf_dset) -> Tuple[int, Dict]: + _config_schema_name = self.config.schema + + _idx = 0 + for datapoints in hf_dset: + # for source schema, the `_idx` will be taken from "album_id" value + if _config_schema_name == "source": + yield datapoints["album_id"], {colname: datapoints[colname] for colname in self.info.features} + + # for seacrowd schema, the `_idx` will be created manually + # since one album_id has multiple pairs of image-text + elif _config_schema_name == "seacrowd_imtext": + # check the len of the features in sequenced columns + # since in source hf there's no validation on data integrity + _len_vars = [] + _ftrs_in_seq = ("image_id", "image_url", "story_index", "story_id", "text") + story_data = datapoints["story"] + for ftr in _ftrs_in_seq: + _len_vars.append(len(story_data[ftr])) + + # skip story w/ mismatched infos + if max(_len_vars) != min(_len_vars): + continue + + for num_data in range(max(_len_vars)): + yield _idx, {"id": _idx, "image_paths": [story_data["image_url"][num_data]], "texts": story_data["text"][num_data], "metadata": {"context": datapoints["title"], "labels": []}} + _idx += 1 + + else: + raise ValueError(f"Received unexpected config schema of {_config_schema_name}!") diff --git a/seacrowd/sea_datasets/burapha_th/__init__.py b/seacrowd/sea_datasets/burapha_th/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/burapha_th/burapha_th.py b/seacrowd/sea_datasets/burapha_th/burapha_th.py new file mode 100644 index 000000000..f71bae77a --- /dev/null +++ b/seacrowd/sea_datasets/burapha_th/burapha_th.py @@ -0,0 +1,167 @@ +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@Article{app12084083, +AUTHOR = {Onuean, Athita and Buatoom, Uraiwan and Charoenporn, Thatsanee and Kim, Taehong and Jung, Hanmin}, +TITLE = {Burapha-TH: A Multi-Purpose Character, Digit, and Syllable Handwriting Dataset}, +JOURNAL = {Applied Sciences}, +VOLUME = {12}, +YEAR = {2022}, +NUMBER = {8}, +ARTICLE-NUMBER = {4083}, +URL = {https://www.mdpi.com/2076-3417/12/8/4083}, +ISSN = {2076-3417}, +DOI = {10.3390/app12084083} +} +""" +_DATASETNAME = "burapha_th" + +_DESCRIPTION = """\ +The dataset has 68 character classes, 10 digit classes, and 320 syllable classes. +For constructing the dataset, 1072 Thai native speakers wrote on collection datasheets +that were then digitized using a 300 dpi scanner. +De-skewing, detection box and segmentation algorithms were applied to the raw scans +for image extraction. 
The dataset, unlike all other known Thai handwriting datasets, retains +existing noise, the white background, and all artifacts generated by scanning. +""" + +_HOMEPAGE = "https://services.informatics.buu.ac.th/datasets/Burapha-TH/" + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False +_LANGUAGES = ["tha"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_URLS = { + "character": {"test": "https://services.informatics.buu.ac.th/datasets/Burapha-TH/character/20210306-test.zip", "train": "https://services.informatics.buu.ac.th/datasets/Burapha-TH/character/20210306-train.zip"}, + "digit": {"test": "https://services.informatics.buu.ac.th/datasets/Burapha-TH/digit/20210307-test.zip", "train": "https://services.informatics.buu.ac.th/datasets/Burapha-TH/digit/20210307-train.zip"}, + "syllable": {"test": "https://services.informatics.buu.ac.th/datasets/Burapha-TH/syllable/20210309-test-ori.zip", "train": "https://services.informatics.buu.ac.th/datasets/Burapha-TH/syllable/20210309-train-ori.zip"}, +} + +_SUPPORTED_TASKS = [Tasks.IMAGE_CAPTIONING] +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + +_SUBSETS = ["character", "digit", "syllable"] + + +def config_constructor(subset: str, schema: str, version: str) -> SEACrowdConfig: + return SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_{schema}", + version=version, + description=f"{_DATASETNAME} {subset} {schema} schema", + schema=f"{schema}", + subset_id=f"{_DATASETNAME}_{subset}", + ) + + +class BuraphaThDataset(datasets.GeneratorBasedBuilder): + """ + The dataset has 68 character classes, 10 digit classes, and 320 syllable classes. + For constructing the dataset, 1072 Thai native speakers wrote on collection datasheets + that were then digitized using a 300 dpi scanner. + De-skewing, detection box and segmentation algorithms were applied to the raw scans for + image extraction. The dataset, unlike all other known Thai handwriting datasets, retains + existing noise, the white background, and all artifacts generated by scanning. 
+ """ + + BUILDER_CONFIGS = [config_constructor(subset, "source", _SOURCE_VERSION) for subset in _SUBSETS] + BUILDER_CONFIGS.extend([config_constructor(subset, "seacrowd_imtext", _SEACROWD_VERSION) for subset in _SUBSETS]) + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_digit_source" + + label_chr_dig = [str(i).zfill(2) for i in range(78)] + label_syl = [str(i).zfill(3) for i in range(320)] + + def _info(self) -> datasets.DatasetInfo: + task = self.config.subset_id.split("_")[2] + if self.config.schema == "source": + features = datasets.Features( + {"id": datasets.Value("string"), "image_paths": datasets.Value("string"), "label": datasets.Sequence(datasets.ClassLabel(names=self.label_chr_dig if task == "character" or task == "digit" else self.label_syl))} + ) + elif self.config.schema == "seacrowd_imtext": + features = schemas.image_text_features(label_names=self.label_chr_dig if task == "character" or task == "digit" else self.label_syl) + else: + raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + task = self.config.subset_id.split("_")[2] + + _local_path = dl_manager.download_and_extract(_URLS[task]) + train_path, test_path = _local_path["train"], _local_path["test"] + if task in ["character", "digit"]: + train_path = os.path.join(train_path, "train") + test_path = os.path.join(test_path, "test") + # for "syllable" type task + else: + train_path = os.path.join(train_path, "train-ori") + test_path = os.path.join(test_path, "test-ori") + + data_pair = {} + + for dir_name in os.listdir(train_path): + dir_name_split = dir_name.split("-") + file_names = [] + + for file_name in os.listdir(os.path.join(train_path, dir_name)): + file_names.append(os.path.join(train_path, dir_name, file_name)) + + label = dir_name_split[0] + data_pair[label] = file_names + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_pair, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_pair, + "split": "test", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + task = self.config.subset_id.split("_")[2] + counter = 0 + + for key, imgs in filepath.items(): + for img in imgs: + if self.config.schema == "source": + yield counter, {"id": str(counter), "image_paths": img, "label": [self.label_chr_dig.index(key) if task == "character" or task == "digit" else self.label_syl.index(key)]} + elif self.config.schema == "seacrowd_imtext": + yield counter, { + "id": str(counter), + "image_paths": [img], + "texts": None, + "metadata": { + "context": None, + "labels": [self.label_chr_dig.index(key) if task in ["character", "digit"] else self.label_syl.index(key)], + }, + } + counter += 1 diff --git a/seacrowd/sea_datasets/cc100/cc100.py b/seacrowd/sea_datasets/cc100/cc100.py index bc9aae0b1..494628052 100644 --- a/seacrowd/sea_datasets/cc100/cc100.py +++ b/seacrowd/sea_datasets/cc100/cc100.py @@ -29,7 +29,6 @@ [seacrowd_schema_name] = ssp """ -from posixpath import split from typing import Dict, List, Tuple import datasets @@ -37,13 +36,14 @@ from seacrowd.utils import schemas from seacrowd.utils.configs import SEACrowdConfig from seacrowd.utils.constants import 
(DEFAULT_SEACROWD_VIEW_NAME, - DEFAULT_SOURCE_VIEW_NAME, Tasks) + DEFAULT_SOURCE_VIEW_NAME, Tasks, TASK_TO_SCHEMA) _DATASETNAME = "cc100" _SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME _UNIFIED_VIEW_NAME = DEFAULT_SEACROWD_VIEW_NAME -_LANGUAGES = ["ind", "jav", "sun"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) +# We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) +_LANGUAGES = ["ind", "jav", "sun", "mya", "mya_zaw", "lao", "khm", "tgl", "vie", "tha", "zlm"] _LOCAL = False _CITATION = """\ @@ -135,9 +135,17 @@ _LICENSE = "MIT" _LANGUAGES_MAP = { - "ind": "id", - "jav": "jv", - "sun": "su", + "ind": "id", # Indonesian + "jav": "jv", # Javanese + "sun": "su", # Sundanese + "mya": "my", # Burmese + "mya_zaw": "my_zaw", # Burmese (Zawgyi) + "lao": "lo", # Lao + "khm": "km", # Central Khmer, Khmer + "tgl": "tl", # Tagalog + "vie": "vi", # Vietnamese + "tha": "th", # Thai + "zlm": "ms", # Malay } _URLS = { @@ -146,17 +154,26 @@ _SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] +_SEACROWD_SCHEMA_NAME = TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower() + _SOURCE_VERSION = "2018.12.01" _SEACROWD_VERSION = "1.0.0" + def seacrowd_config_constructor(lang, schema, version): """Construct SEACrowdConfig with cc100_{lang}_{schema} as the name format.""" - if schema != "source" and schema != "seacrowd_ssp": + if schema != "source" and schema != f"seacrowd_{_SEACROWD_SCHEMA_NAME}": raise ValueError(f"Invalid schema: {schema}") if lang == "": - raise ValueError(f"Language is required. Choose one of these languages: {_LANGUAGES}.") + return SEACrowdConfig( + name=f"cc100_{schema}", + version=datasets.Version(version), + description=f"CC100 with {schema} schema for all languages", + schema=schema, + subset_id="cc100", + ) elif lang in _LANGUAGES: return SEACrowdConfig( name=f"cc100_{lang}_{schema}", @@ -171,14 +188,15 @@ def seacrowd_config_constructor(lang, schema, version): class CC100(datasets.GeneratorBasedBuilder): """Monolingual Datasets from Web Crawl Data.""" - - DEFAULT_CONFIG_NAME = "cc100_jav_source" - - BUILDER_CONFIGS = [ - seacrowd_config_constructor(lang, "source", _SOURCE_VERSION) for lang in _LANGUAGES_MAP - ] + [ - seacrowd_config_constructor(lang, "seacrowd_ssp", _SEACROWD_VERSION) for lang in _LANGUAGES_MAP - ] + + BUILDER_CONFIGS = ( + [seacrowd_config_constructor(lang, "source", _SOURCE_VERSION) for lang in _LANGUAGES_MAP] + + [seacrowd_config_constructor(lang, f"seacrowd_{_SEACROWD_SCHEMA_NAME}", _SEACROWD_VERSION) for lang in _LANGUAGES_MAP] + + [ + seacrowd_config_constructor("", "source", _SOURCE_VERSION), + seacrowd_config_constructor("", f"seacrowd_{_SEACROWD_SCHEMA_NAME}", _SOURCE_VERSION), + ] + ) def _info(self) -> datasets.DatasetInfo: if self.config.schema == "source": @@ -188,7 +206,7 @@ def _info(self) -> datasets.DatasetInfo: "text": datasets.Value("string"), } ) - elif self.config.schema == "seacrowd_ssp": + elif self.config.schema == f"seacrowd_{_SEACROWD_SCHEMA_NAME}": features = schemas.self_supervised_pretraining.features return datasets.DatasetInfo( @@ -201,14 +219,13 @@ def _info(self) -> datasets.DatasetInfo: def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - split_name = self.config.name.split("_") - if split_name[1] == "source" or split_name[1] == "seacrowd": - lang = _DEFAULT_LANGUAGE + if self.config.name == "cc100_source" or self.config.name == f"cc100_seacrowd_{_SEACROWD_SCHEMA_NAME}": + # Load all languages + path = 
dl_manager.download_and_extract([_URLS["train"].format(lang=_LANGUAGES_MAP[lang]) for lang in _LANGUAGES_MAP]) else: - lang = split_name[1] - url = _URLS["train"].format(lang=_LANGUAGES_MAP[lang]) - path = dl_manager.download_and_extract(url) + url = _URLS["train"].format(lang=_LANGUAGES_MAP[split_name[1]]) + path = dl_manager.download_and_extract(url) return [ datasets.SplitGenerator( @@ -234,7 +251,7 @@ def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: "text": row.strip(), }, ) - elif self.config.schema == "seacrowd_ssp": + elif self.config.schema == f"seacrowd_{_SEACROWD_SCHEMA_NAME}": for counter, row in enumerate(f): if row.strip() != "": yield ( @@ -243,4 +260,4 @@ def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: "id": str(counter), "text": row.strip(), }, - ) \ No newline at end of file + ) diff --git a/seacrowd/sea_datasets/cc3m_35l/__init__.py b/seacrowd/sea_datasets/cc3m_35l/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py b/seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py new file mode 100644 index 000000000..f7782297f --- /dev/null +++ b/seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py @@ -0,0 +1,242 @@ +import os +from typing import Dict, List, Tuple + +import datasets +import jsonlines as jl +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{thapliyal-etal-2022-crossmodal, + title = "Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset", + author = "Thapliyal, Ashish V. and + Pont Tuset, Jordi and + Chen, Xi and + Soricut, Radu", + editor = "Goldberg, Yoav and + Kozareva, Zornitsa and + Zhang, Yue", + booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing", + month = dec, + year = "2022", + address = "Abu Dhabi, United Arab Emirates", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.emnlp-main.45", + doi = "10.18653/v1/2022.emnlp-main.45", + pages = "715--729", +} +""" + +_DATASETNAME = "cc3m_35l" + +_DESCRIPTION = """\ + CC3M-35L is created by translating Conceptual Captions 3M (Sharma et al., 2018), + originally in English, to the other 34 languages using Google's machine translation API. +""" + +_HOMEPAGE = "https://google.github.io/crossmodal-3600/" + +_LICENSE = Licenses.CC_BY_4_0.value + +# the image URLs are contained in tsv file together with the original captions which can be downloaded locally using google account. +# those tsv file originally can be found and downloaded from this page https://ai.google.com/research/ConceptualCaptions/download +# there are no direct image folder ready, so it needs to be downloaded one by one +# some warnings may occur when downloading due to reasons such as security certificate and others +_URLS = { + "trans_train": "https://storage.googleapis.com/crossmodal-3600/cc3m_mt_train.jsonl.gz", + "trans_dev": "https://storage.googleapis.com/crossmodal-3600/cc3m_mt_dev.jsonl.gz", +} + +_SUPPORTED_TASKS = [Tasks.IMAGE_CAPTIONING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + +_LANGUAGES = ["fil", "ind", "tha", "vie"] + +_LOCAL = True + + +class CC3M35L(datasets.GeneratorBasedBuilder): + """ + CC3M-35L is created by translating Conceptual Captions 3M (Sharma et al., 2018), + originally in English, to the other 34 languages using Google's machine translation API. 
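+
+    Note added for clarity (an illustrative sketch, not from the original card): because
+    _LOCAL is True, the Conceptual Captions TSV files (Train_GCC-training.tsv and
+    Validation_GCC-1.1.0-Validation.tsv) must be downloaded manually and their folder
+    passed as data_dir, roughly:
+
+        from datasets import load_dataset
+
+        dset = load_dataset(
+            "seacrowd/sea_datasets/cc3m_35l/cc3m_35l.py",
+            name="cc3m_35l_ind_seacrowd_imtext",
+            data_dir="/path/to/folder/with/gcc/tsv/files",
+        )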
+    """
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    BUILDER_CONFIGS = [SEACrowdConfig(name=f"cc3m_35l_{lang}_source", version=datasets.Version(_SOURCE_VERSION), description=f"cc3m_35l_{lang} source schema", schema="source", subset_id=f"cc3m_35l_{lang}",) for lang in _LANGUAGES] + [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_{lang}_seacrowd_imtext",
+            version=datasets.Version(_SEACROWD_VERSION),
+            description=f"{_DATASETNAME}_{lang} SEACrowd schema",
+            schema="seacrowd_imtext",
+            subset_id=f"{_DATASETNAME}_{lang}",
+        )
+        for lang in _LANGUAGES
+    ]
+
+    DEFAULT_CONFIG_NAME = "cc3m_35l_ind_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "id": datasets.Value("string"),
+                    "image_paths": datasets.Value("string"),
+                    "src_lang": datasets.Value("string"),
+                    "caption_tokenized": datasets.Value("string"),
+                    "trg_lang": datasets.Value("string"),
+                    "translation_tokenized": datasets.Value("string"),
+                    "backtranslation_tokenized": datasets.Value("string"),
+                }
+            )
+        elif self.config.schema == "seacrowd_imtext":
+            features = schemas.image_text_features()
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def fill_img_path(self, df: pd.DataFrame, line: dict):
+        exceptions = []
+        # work on a copy so the assignments below persist and can be merged back via df.update()
+        selected_row = df.query('caption==@line["caption_tokenized"]').copy()
+        # it may return several rows, skip if empty
+        if not selected_row.empty:
+            # for each row, download the image, use its path and put the translation
+            for idx, row in selected_row.iterrows():
+                selected_row.at[idx, "trans_caption"] = line["translation_tokenized"]
+                selected_row.at[idx, "backtrans_caption"] = line["backtranslation_tokenized"]
+                # if the image cannot be downloaded for some reason, skip it
+                # may cause difference in the total data each run
+                try:
+                    selected_row.at[idx, "img_path"] = datasets.DownloadManager().download(row["img_url"])
+                except Exception:
+                    exceptions.append(idx)
+
+        return selected_row, exceptions
+
+    def is_target(self, line: dict, trg_lang: str):
+        if line["trg_lang"] == trg_lang:
+            return line
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        dev_path = dl_manager.download_and_extract(_URLS["trans_dev"])
+        train_path = dl_manager.download_and_extract(_URLS["trans_train"])
+
+        if self.config.data_dir is None:
+            raise ValueError("This is a local dataset. 
Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + # read tsv from local train and validation files + gcc_val = os.path.join(data_dir, "Validation_GCC-1.1.0-Validation.tsv") + gcc_train = os.path.join(data_dir, "Train_GCC-training.tsv") + + # make it into pandas dataframe + colnames = ["caption", "img_url"] + gcc_val_df = pd.read_csv(gcc_val, sep="\t", header=None, names=colnames) + gcc_train_df = pd.read_csv(gcc_train, sep="\t", header=None, names=colnames) + + # add new column to keep the downloaded image path + gcc_val_df["img_path"] = None + gcc_train_df["img_path"] = None + + # add new column to keep the translated caption + gcc_val_df["trans_caption"] = None + gcc_train_df["trans_caption"] = None + + gcc_val_df["backtrans_caption"] = None + gcc_train_df["backtrans_caption"] = None + + # match the original captions in the translated set to the dataframe caption + # download the images from the URL and use it as the filepath + train_exceptions = [] + val_exceptions = [] + + current_lang = self.config.subset_id.split("_")[2] + val_caption_targets = [] + train_caption_targets = [] + + # filter validation data + with jl.open(os.path.join(dev_path), mode="r") as j: + val_caption_targets = [line for line in j if line["trg_lang"] == current_lang] + + #for line in val_caption_targets[:100]: # this was for debugging only + for line in val_caption_targets: + res = self.fill_img_path(gcc_train_df, line) + val_exceptions.extend(res[1]) + gcc_val_df.update(res[0]) + + # clean the memory + val_caption_targets = [] + + # filter train data + with jl.open(os.path.join(train_path), mode="r") as j: + train_caption_targets = [line for line in j if line["trg_lang"] == current_lang] + + + #for line in train_caption_targets[:100]: # this was for debugging only + for line in train_caption_targets: + res = self.fill_img_path(gcc_val_df, line) + train_exceptions.extend(res[1]) + gcc_train_df.update(res[0]) + + # clean the memory + train_caption_targets = [] + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": gcc_train_df, + "exceptions": train_exceptions, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": gcc_val_df, + "exceptions": val_exceptions, + }, + ), + ] + + def _generate_examples(self, filepath: dict, exceptions: list) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + for idx, row in filepath.iterrows(): + if idx not in exceptions: + if self.config.schema == "source": + yield idx, { + "id": str(idx), + "image_paths": row["img_path"], + "src_lang": "en", + "caption_tokenized": row["caption"], + "trg_lang": self.config.subset_id.split("_")[2], + "translation_tokenized": row["trans_caption"], + "backtranslation_tokenized": row["backtrans_caption"], + } + + elif self.config.schema == "seacrowd_imtext": + yield idx, { + "id": str(idx), + "image_paths": [row["img_path"]], + "texts": row["trans_caption"], + "metadata": { + "context": None, + "labels": None, + }, + } + + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/cc_aligned_doc/__init__.py b/seacrowd/sea_datasets/cc_aligned_doc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/cc_aligned_doc/cc_aligned_doc.py b/seacrowd/sea_datasets/cc_aligned_doc/cc_aligned_doc.py new file mode 100644 index 000000000..4f0463178 --- /dev/null +++ b/seacrowd/sea_datasets/cc_aligned_doc/cc_aligned_doc.py @@ -0,0 +1,154 
@@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """\
+@inproceedings{elkishky_ccaligned_2020,
+    author = {El-Kishky, Ahmed and Chaudhary, Vishrav and Guzm{\'a}n, Francisco and Koehn, Philipp},
+    booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP 2020)},
+    month = {November},
+    title = {{CCAligned}: A Massive Collection of Cross-lingual Web-Document Pairs},
+    year = {2020},
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/2020.emnlp-main.480",
+    doi = "10.18653/v1/2020.emnlp-main.480",
+    pages = "5960--5969"
+}
+"""
+
+_DATASETNAME = "cc_aligned_doc"
+
+_DESCRIPTION = """\
+CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English \
+(10 languages are from Southeast Asia; Burmese has two document collections with different scripts). \
+These web-document pairs were constructed by performing language identification on raw web-documents, \
+and ensuring that corresponding language codes appeared in the URLs of the web documents. This pattern \
+matching approach yielded more than 100 million aligned documents paired with English.
+""" + +_HOMEPAGE = "https://www2.statmt.org/cc-aligned/" + +_LANGUAGES = ["ind", "sun", "tha", "vie", "zlm", "lao", "khm", "mya", "ceb", "war"] + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False +_SUBSETS = {"id_ID": "ind", "su_ID": "sun", "th_TH": "tha", "vi_VN": "vie", "ms_MY": "zlm", "lo_LA": "lao", "km_KH": "khm", "my_MM": "mya", "my_MM_zaw": "mya", "cx_PH": "ceb", "wy_PH": "war"} +_URLS = {_DATASETNAME: "https://data.statmt.org/cc-aligned/en_XX-{subset}.tsv.xz"} + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class CCAlignedDocDataset(datasets.GeneratorBasedBuilder): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + SEACROWD_SCHEMA_NAME = "t2t" + + BUILDER_CONFIGS = [SEACrowdConfig(name=f"{_DATASETNAME}_{subset}_source", version=datasets.Version(_SOURCE_VERSION), description=f"{_DATASETNAME} source schema", schema="source", subset_id=f"{_DATASETNAME}",) for subset in _SUBSETS.keys()] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_seacrowd_{schema_name}", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{schema_name}", + subset_id=f"{_DATASETNAME}", + ) + for subset, schema_name in zip(_SUBSETS.keys(), len(_SUBSETS.keys()) * [SEACROWD_SCHEMA_NAME]) + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_id_ID_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "Domain": datasets.Value("string"), + "Source_URL": datasets.Value("string"), + "Source_Content": datasets.Value("string"), + "Target_URL": datasets.Value("string"), + "Target_Content": datasets.Value("string"), + } + ) + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + subset = "_".join([self.config.name.split("_")[3], self.config.name.split("_")[4]]) + urls = _URLS[_DATASETNAME].format(subset=subset) + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir, + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + subset = "_".join([self.config.name.split("_")[3], self.config.name.split("_")[4]]) + lines = open(filepath, "r").readlines() + if self.config.schema == "source": + idx = 0 + for line in lines: + content = line.split("\t") + example = { + "Domain": content[0], + "Source_URL": content[1], + "Source_Content": content[2], + "Target_URL": content[3], + "Target_Content": content[4], + } + yield idx, example + idx += 1 + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + idx = 0 + for line in lines: + content = line.split("\t") + example = { + "id": str(idx), + "text_1": content[2], + "text_2": content[4], + "text_1_name": "en", + "text_2_name": _SUBSETS[subset], + } + yield idx, example + idx += 1 diff --git a/seacrowd/sea_datasets/cc_aligned_sent/__init__.py b/seacrowd/sea_datasets/cc_aligned_sent/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/seacrowd/sea_datasets/cc_aligned_sent/cc_aligned_sent.py b/seacrowd/sea_datasets/cc_aligned_sent/cc_aligned_sent.py new file mode 100644 index 000000000..0e70878b7 --- /dev/null +++ b/seacrowd/sea_datasets/cc_aligned_sent/cc_aligned_sent.py @@ -0,0 +1,167 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +from datasets.download.download_manager import DownloadManager + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = r""" +@inproceedings{chaudhary-etal-2019-low, + title = "Low-Resource Corpus Filtering Using Multilingual Sentence Embeddings", + author = "Chaudhary, Vishrav and + Tang, Yuqing and + Guzm{\'a}n, Francisco and + Schwenk, Holger and + Koehn, Philipp", + editor = "Bojar, Ond{\v{r}}ej and + Chatterjee, Rajen and + Federmann, Christian and + Fishel, Mark and + Graham, Yvette and + Haddow, Barry and + Huck, Matthias and + Yepes, Antonio Jimeno and + Koehn, Philipp and + Martins, Andr{\'e} and + Monz, Christof and + Negri, Matteo and + N{\'e}v{\'e}ol, Aur{\'e}lie and + Neves, Mariana and + Post, Matt and + Turchi, Marco and + Verspoor, Karin", + booktitle = "Proceedings of the Fourth Conference on Machine Translation (Volume 3: Shared Task Papers, Day 2)", + month = aug, + year = "2019", + address = "Florence, Italy", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/W19-5435", + doi = "10.18653/v1/W19-5435", + pages = "261--266", +} +""" + +_LOCAL = False +_LANGUAGES = ["ind", "jav", "sun", "tha", "vie", "zlm", "lao", "khm", "mya", "ceb"] +_DATASETNAME = "cc_aligned_sent" +_DESCRIPTION = """\ +This dataset contains the sentence pairs extracted from CC-Aligned document +pairs using similarity scores of LASER embeddings (minimum similarity 1.04, +sorted based on decreasing similarity score). It misses some languages not +covered by LASER. +""" + +_HOMEPAGE = "https://www2.statmt.org/cc-aligned/" +_LICENSE = Licenses.UNKNOWN.value +_URL = "https://data.statmt.org/cc-aligned/sentence-aligned/" + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + +_SUBSETS = ["id_ID", "jv_ID", "su_ID", "th_TH", "vi_VN", "ms_MY", "lo_LA", "km_KH", "my_MM", "cx_PH"] + + +class CCAlignedSentencesDataset(datasets.GeneratorBasedBuilder): + """CC Aligned Sentences dataset by Chaudhary et al., (2019)""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "t2t" + + # Add configurations for loading a dataset per language. 
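+    # For example (illustrative), the id_ID subset yields the config names
+    # "cc_aligned_sent_id_ID_source" and "cc_aligned_sent_id_ID_seacrowd_t2t".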
+ dataset_names = sorted([f"{_DATASETNAME}_{subset}" for subset in _SUBSETS]) + BUILDER_CONFIGS = [] + for name in dataset_names: + source_config = SEACrowdConfig( + name=f"{name}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=name, + ) + BUILDER_CONFIGS.append(source_config) + seacrowd_config = SEACrowdConfig( + name=f"{name}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=name, + ) + BUILDER_CONFIGS.append(seacrowd_config) + + # Choose first language as default + first_subset = sorted(_SUBSETS)[0] + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{first_subset}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "Source_Sentence": datasets.Value("string"), + "Target_Sentence": datasets.Value("string"), + "LASER_similarity": datasets.Value("float64"), + } + ) + + if self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text_to_text.features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]: + """Return SplitGenerators.""" + # Define some functions for parsing config and URL names + def _split_at_n(text: str, n: int) -> Tuple[str, str]: + """Split text on the n-th instance""" + return ("_".join(text.split("_")[:n]), "_".join(text.split("_")[n:])) + + # Get URL. For cx_PH, the source and target languages are reversed + _, subset = _split_at_n(_split_at_n(self.config.name, 5)[0], 3) + (source_lang, target_lang) = (subset, "en_XX") if subset == "cx_PH" else ("en_XX", subset) + url = _URL + f"{source_lang}-{target_lang}.tsv.xz" + filepath = dl_manager.download_and_extract(url) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": filepath, + "source_lang": source_lang, + "target_lang": target_lang, + }, + ) + ] + + def _generate_examples(self, filepath: Path, source_lang: str, target_lang: str) -> Tuple[int, Dict]: + """Yield examples as (key, example) tuples""" + with open(filepath, encoding="utf-8") as file: + for idx, row in enumerate(file): + text_1, text_2, score = row.strip().split("\t") + if self.config.schema == "source": + example = { + "id": idx, + "Source_Sentence": text_1, + "Target_Sentence": text_2, + "LASER_similarity": float(score), + } + if self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + example = { + "id": idx, + "text_1": text_1, + "text_2": text_2, + "text_1_name": source_lang, + "text_2_name": target_lang, + } + yield idx, example diff --git a/seacrowd/sea_datasets/coco_35l/__init__.py b/seacrowd/sea_datasets/coco_35l/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/coco_35l/coco_35l.py b/seacrowd/sea_datasets/coco_35l/coco_35l.py new file mode 100644 index 000000000..78770aea3 --- /dev/null +++ b/seacrowd/sea_datasets/coco_35l/coco_35l.py @@ -0,0 +1,228 @@ +import json +import os +from typing import Dict, List, Tuple + +# import csv +import datasets +import jsonlines as jl +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ 
+@inproceedings{thapliyal-etal-2022-crossmodal,
+    title = "Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset",
+    author = "Thapliyal, Ashish V. and
+      Pont Tuset, Jordi and
+      Chen, Xi and
+      Soricut, Radu",
+    editor = "Goldberg, Yoav and
+      Kozareva, Zornitsa and
+      Zhang, Yue",
+    booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
+    month = dec,
+    year = "2022",
+    address = "Abu Dhabi, United Arab Emirates",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2022.emnlp-main.45",
+    doi = "10.18653/v1/2022.emnlp-main.45",
+    pages = "715--729",
+}
+"""
+
+_DATASETNAME = "coco_35l"
+
+_DESCRIPTION = """\
+    COCO-35L is a machine-generated image caption dataset, constructed by translating COCO Captions (Chen et al., 2015) to the other 34 languages using Google’s machine translation API.
+    This dataloader uses the COCO 2014 train and validation sets. Note that 152,520 image ids in the translated training captions cannot be matched to the COCO 2014 training annotations;
+    the validation set is not affected.
+    """
+
+_HOMEPAGE = "https://google.github.io/crossmodal-3600/"
+
+_LICENSE = Licenses.CC_BY_4_0.value
+
+_URLS = {
+    "coco2017_train_images": "http://images.cocodataset.org/zips/train2017.zip",
+    "coco2014_train_images": "http://images.cocodataset.org/zips/train2014.zip",
+    "coco2014_val_images": "http://images.cocodataset.org/zips/val2014.zip",
+    "coco2014_train_val_annots": "http://images.cocodataset.org/annotations/annotations_trainval2014.zip",
+    "coco2017_train_val_annots": "http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
+    "trans_train": "https://storage.googleapis.com/crossmodal-3600/coco_mt_train.jsonl.gz",
+    "trans_dev": "https://storage.googleapis.com/crossmodal-3600/coco_mt_dev.jsonl.gz",
+}
+
+_SUPPORTED_TASKS = [Tasks.IMAGE_CAPTIONING]
+
+_SOURCE_VERSION = "1.0.0"
+
+_SEACROWD_VERSION = "1.0.0"
+
+_LANGUAGES = {"fil": "fil", "ind": "id", "tha": "th", "vie": "vi"}
+
+
+class Coco35LDataset(datasets.GeneratorBasedBuilder):
+    """
+    COCO-35L is a machine-generated image caption dataset, constructed by translating COCO Captions (Chen et al., 2015) to the other 34 languages using Google’s machine translation API.
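+
+    Illustrative usage (a sketch added for clarity, not from the original card; note that the
+    first run downloads the COCO 2014 images and annotations, on the order of 20 GB):
+
+        from datasets import load_dataset
+
+        dset = load_dataset(
+            "seacrowd/sea_datasets/coco_35l/coco_35l.py",
+            name="coco_35l_ind_seacrowd_imtext",
+        )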
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME}_{lang} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{lang}", + ) for lang in _LANGUAGES + ] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}_seacrowd_imtext", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME}_{lang} SEACrowd schema", + schema="seacrowd_imtext", + subset_id=f"{_DATASETNAME}_{lang}", + ) for lang in _LANGUAGES + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{sorted(_LANGUAGES)[0]}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "image_paths": datasets.Value("string"), + "src_lang": datasets.Value("string"), + "caption_tokenized": datasets.Value("string"), + "trg_lang": datasets.Value("string"), + "translation_tokenized": datasets.Value("string"), + "backtranslation_tokenized": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_imtext": + features = schemas.image_text_features() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + trans_train_path = dl_manager.download_and_extract(_URLS["trans_train"]) + trans_val_path = dl_manager.download_and_extract(_URLS["trans_dev"]) + + coco2014_train_val_annots_path = dl_manager.download_and_extract(_URLS["coco2014_train_val_annots"]) + coco2014_val_images_path = dl_manager.download_and_extract(_URLS["coco2014_val_images"]) + coco2014_train_images_path = dl_manager.download_and_extract(_URLS["coco2014_train_images"]) + + trans_train_captions = {} + trans_dev_captions = {} + train_df = pd.DataFrame() + val_df = pd.DataFrame() + + current_lang = _LANGUAGES[self.config.subset_id.split("_")[2]] + + # the COCO dataset structure has separated the captions and images information. The caption's "image_id" key will refer to the image's "id" key. + # load the image informations from COCO 2014 dataset and put it into a dataframe + with open(os.path.join(coco2014_train_val_annots_path, "annotations", "captions_val2014.json")) as json_captions: + captions = json.load(json_captions) + val_df = pd.DataFrame(captions["images"]) + + with open(os.path.join(coco2014_train_val_annots_path, "annotations", "captions_train2014.json")) as json_captions: + captions = json.load(json_captions) + train_df = pd.DataFrame(captions["images"]) + + # the translated caption has "image_id" which refers to the "image_id" in the COCO annotations. + # however we can skip this and connect it to the images' "id" + # the example of an "image_id" in the translated caption -> "123456_0" since an image can has many descriptions. + # thus, the real image_id to map it into the COCO image dataset is the "123456" + with jl.open(trans_train_path, mode="r") as j: + total = 0 + not_found = 0 + missing_ids = [] + for line in j: + if line["trg_lang"] == current_lang: + total += 1 + + trans_img_id = line["image_id"] + coco2014_img_id = line["image_id"].split("_")[0] + + # unfortunately, not all image_id in the translated caption can be found in the original COCO 2014. 
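+                    # (per the note in _DESCRIPTION, roughly 152,520 training image ids are affected)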
+ # hence, we need to handle such errors + try: + filename = train_df.query(f"id=={int(coco2014_img_id)}")["file_name"].values[0] + trans_train_captions[trans_img_id] = line + trans_train_captions[trans_img_id]["filename"] = os.path.join(coco2014_train_images_path, "train2014", filename) + except IndexError: + missing_ids.append(trans_img_id) + not_found += 1 + pass + + # the validation set are strangely okay. with no missing image_id(s) + with jl.open(trans_val_path, mode="r") as j: + for line in j: + if line["trg_lang"] == current_lang: + trans_img_id = line["image_id"] + trans_dev_captions[trans_img_id] = line + coco2014_img_id = int(trans_img_id.split("_")[0]) + filename = val_df.query(f"id=={coco2014_img_id}")["file_name"].values[0] + trans_dev_captions[trans_img_id]["filename"] = os.path.join(coco2014_val_images_path, "val2014", filename) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": { + "images": trans_train_captions, + }, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": { + "images": trans_dev_captions, + }, + "split": "dev", + }, + ), + ] + + def _generate_examples(self, filepath: dict, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + counter = 0 + for trans_img_id, data in filepath["images"].items(): + if self.config.schema == "source": + yield counter, { + "id": trans_img_id + "_" + str(counter), + "image_paths": data["filename"], + "src_lang": data["src_lang"], + "caption_tokenized": data["caption_tokenized"], + "trg_lang": data["trg_lang"], + "translation_tokenized": data["translation_tokenized"], + "backtranslation_tokenized": data["backtranslation_tokenized"], + } + + elif self.config.schema == "seacrowd_imtext": + yield counter, { + "id": trans_img_id + "_" + str(counter), + "image_paths": [data["filename"]], + "texts": data["translation_tokenized"], + "metadata": { + "context": None, + "labels": None, + }, + } + + else: + raise ValueError(f"Invalid config: {self.config.name}") + + counter += 1 diff --git a/seacrowd/sea_datasets/codeswitch_reddit/__init__.py b/seacrowd/sea_datasets/codeswitch_reddit/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/codeswitch_reddit/codeswitch_reddit.py b/seacrowd/sea_datasets/codeswitch_reddit/codeswitch_reddit.py new file mode 100644 index 000000000..8b71f940b --- /dev/null +++ b/seacrowd/sea_datasets/codeswitch_reddit/codeswitch_reddit.py @@ -0,0 +1,209 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
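+
+# Illustrative note (added for clarity, not part of the original script): the code-switched
+# subset can be loaded with the SEACrowd multilabel text config, e.g.
+#     load_dataset("seacrowd/sea_datasets/codeswitch_reddit/codeswitch_reddit.py",
+#                  name="codeswitch_reddit_cs_seacrowd_text_multi")
+# while the monolingual English subset uses "codeswitch_reddit_eng_monolingual_seacrowd_ssp".
+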
+import html +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@inproccedings{rabinovich-2019-codeswitchreddit, + author = {Rabinovich, Ella and Sultani, Masih and Stevenson, Suzanne}, + title = {CodeSwitch-Reddit: Exploration of Written Multilingual Discourse in Online Discussion Forums}, + booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing}, + publisher = {Association for Computational Linguistics}, + year = {2019}, + url = {https://aclanthology.org/D19-1484}, + doi = {10.18653/v1/D19-1484}, + pages = {4776--4786}, +} +""" + +_LOCAL = False +_LANGUAGES = ["eng", "ind", "tgl"] +_DATASETNAME = "codeswitch_reddit" +_DESCRIPTION = """ +This corpus consists of monolingual English and multilingual (English and one other language) posts +from country-specific subreddits, including r/indonesia, r/philippines and r/singapore for Southeast Asia. +Posts were manually classified whether they contained code-switching or not. +""" + +_HOMEPAGE = "https://github.com/ellarabi/CodeSwitch-Reddit" +_LICENSE = Licenses.UNKNOWN.value +_URL = "http://www.cs.toronto.edu/~ella/code-switch.reddit.tar.gz" + +_SUPPORTED_TASKS = [Tasks.CODE_SWITCHING_IDENTIFICATION, Tasks.SELF_SUPERVISED_PRETRAINING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class CodeSwitchRedditDataset(datasets.GeneratorBasedBuilder): + """Dataset of monolingual English and multilingual comments from country-specific subreddits.""" + + SUBSETS = ["cs", "eng_monolingual"] + INCLUDED_SUBREDDITS = ["indonesia", "Philippines", "singapore"] + INCLUDED_LANGUAGES = {"English": "eng", "Indonesian": "ind", "Tagalog": "tgl"} + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema for {subset} subset", + schema="source", + subset_id=f"{_DATASETNAME}_{subset}", + ) + for subset in SUBSETS + ] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_eng_monolingual_seacrowd_ssp", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd ssp schema for eng_monolingual subset", + schema="seacrowd_ssp", + subset_id=f"{_DATASETNAME}_eng_monolingual", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_cs_seacrowd_text_multi", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd text multilabel schema for cs subset", + schema="seacrowd_text_multi", + subset_id=f"{_DATASETNAME}_cs", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_cs_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + if "cs" in self.config.subset_id: + features = datasets.Features( + { + "author": datasets.Value("string"), + "subreddit": datasets.Value("string"), + "country": datasets.Value("string"), + "date": datasets.Value("int32"), + "confidence": datasets.Value("int32"), + "lang1": datasets.Value("string"), + "lang2": datasets.Value("string"), + "text": datasets.Value("string"), + "id": datasets.Value("string"), + "link_id": datasets.Value("string"), + "parent_id": datasets.Value("string"), + } + ) + elif "eng_monolingual" in self.config.subset_id: + features = datasets.Features( + { + "author": datasets.Value("string"), + 
"subreddit": datasets.Value("string"), + "country": datasets.Value("string"), + "date": datasets.Value("int32"), + "confidence": datasets.Value("int32"), + "lang": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_ssp": + features = schemas.ssp_features + elif self.config.schema == "seacrowd_text_multi": + features = schemas.text_multi_features(label_names=list(self.INCLUDED_LANGUAGES.values())) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + data_dir = dl_manager.download_and_extract(_URL) + if "cs" in self.config.subset_id: + filepath = os.path.join(data_dir, "cs_main_reddit_corpus.csv") + elif "eng_monolingual" in self.config.subset_id: + filepath = os.path.join(data_dir, "eng_monolingual_reddit_corpus.csv") + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": filepath, + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + df = pd.read_csv(filepath, index_col=None, header="infer", encoding="utf-8") + df = df[df["Subreddit"].isin(self.INCLUDED_SUBREDDITS)] + + if self.config.subset_id.split("_")[-1] == "cs": + df = df[(df["Lang1"].isin(self.INCLUDED_LANGUAGES)) & (df["Lang2"].isin(self.INCLUDED_LANGUAGES))] + df.reset_index(drop=True, inplace=True) + + for index, row in df.iterrows(): + parsed_text = html.unescape(row["Text"]) + if self.config.schema == "source": + example = { + "author": row["Author"], + "subreddit": row["Subreddit"], + "country": row["Country"], + "date": row["Date"], + "confidence": row["confidence"], + "lang1": row["Lang1"], + "lang2": row["Lang2"], + "text": parsed_text, + "id": row["id"], + "link_id": row["link_id"], + "parent_id": row["parent_id"], + } + + elif self.config.schema == "seacrowd_text_multi": + lang_one, lang_two = self.INCLUDED_LANGUAGES[row["Lang1"]], self.INCLUDED_LANGUAGES[row["Lang2"]] + example = { + "id": str(index), + "text": parsed_text, + "labels": list(sorted([lang_one, lang_two])), # Language order doesn't matter in original dataset; just arrange alphabetically for consistency + } + yield index, example + + else: + df.reset_index(drop=True, inplace=True) + for index, row in df.iterrows(): + parsed_text = html.unescape(row["Text"]) + if self.config.schema == "source": + example = { + "author": row["Author"], + "subreddit": row["Subreddit"], + "country": row["Country"], + "date": row["Date"], + "confidence": row["confidence"], + "lang": row["Lang"], + "text": parsed_text, + } + elif self.config.schema == "seacrowd_ssp": + example = { + "id": str(index), + "text": parsed_text, + } + yield index, example diff --git a/seacrowd/sea_datasets/commonvoice_120/__init__.py b/seacrowd/sea_datasets/commonvoice_120/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/commonvoice_120/commonvoice_120.py b/seacrowd/sea_datasets/commonvoice_120/commonvoice_120.py new file mode 100644 index 000000000..00305120f --- /dev/null +++ b/seacrowd/sea_datasets/commonvoice_120/commonvoice_120.py @@ -0,0 +1,208 @@ +# coding=utf-8 +import csv +import json +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils 
import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{commonvoice:2020, + author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.}, + title = {Common Voice: A Massively-Multilingual Speech Corpus}, + booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)}, + pages = {4211--4215}, + year = 2020 +} +""" + +_DATASETNAME = "commonvoice_120" + +_DESCRIPTION = """\ +The Common Mozilla Voice dataset consists of a unique MP3 and corresponding text file. +Many of the 26119 recorded hours in the dataset also include demographic metadata like age, sex, and accent that can help improve the accuracy of speech recognition engines. +The dataset currently consists of 17127 validated hours in 104 languages, but more voices and languages are always added. + +Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/mozilla-foundation/common_voice_12_0 and use huggingface-cli login for authentication +""" + +_HOMEPAGE = "https://commonvoice.mozilla.org/en/datasets" + +_LANGUAGES = ["cnh", "ind", "tha", "vie"] +_LANG_TO_CVLANG = {"cnh": "cnh", "ind": "id", "tha": "th", "vie": "vi"} + +_AGE_TO_INT = {"": None, "teens": 10, "twenties": 20, "thirties": 30, "fourties": 40, "fifties": 50, "sixties": 60, "seventies": 70, "eighties": 80} + +_LICENSE = Licenses.CC0_1_0.value + +# Note: the dataset is gated in HuggingFace. It's public after providing access token +_LOCAL = False + +_COMMONVOICE_URL_TEMPLATE = "https://huggingface.co/datasets/mozilla-foundation/common_voice_12_0/resolve/main/" +_URLS = {"audio": _COMMONVOICE_URL_TEMPLATE + "audio/{lang}/{split}/{lang}_{split}_{shard_idx}.tar", "transcript": _COMMONVOICE_URL_TEMPLATE + "transcript/{lang}/{split}.tsv", "n_shards": _COMMONVOICE_URL_TEMPLATE + "n_shards.json"} + +_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION, Tasks.TEXT_TO_SPEECH] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class Commonvoice120(datasets.GeneratorBasedBuilder): + """This is the dataloader for CommonVoice 12.0 Mozilla""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = ( + *[ + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}{'_' if lang else ''}source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema for {lang}", + schema="source", + subset_id=f"{_DATASETNAME}{'_' if lang else ''}{lang}", + ) + for lang in ["", *_LANGUAGES] + ], + *[ + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}{'_' if lang else ''}seacrowd_sptext", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema for {lang}", + schema="seacrowd_sptext", + subset_id=f"{_DATASETNAME}{'_' if lang else ''}{lang}", + ) + for lang in ["", *_LANGUAGES] + ], + ) + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "client_id": datasets.Value("string"), + "path": datasets.Value("string"), + "audio": datasets.features.Audio(sampling_rate=48_000), + "sentence": datasets.Value("string"), + "up_votes": datasets.Value("int64"), + "down_votes": datasets.Value("int64"), + "age": datasets.Value("string"), + "gender": datasets.Value("string"), + "accent": 
datasets.Value("string"), + "locale": datasets.Value("string"), + "segment": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_sptext": + features = schemas.speech_text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + lang_code = self.config.subset_id.split("_")[-1] + languages = [_LANG_TO_CVLANG.get(lang, lang) for lang in (_LANGUAGES if lang_code == "120" else [lang_code])] + n_shards_path = dl_manager.download_and_extract(_URLS["n_shards"]) + with open(n_shards_path, encoding="utf-8") as f: + n_shards = json.load(f) + + audio_urls = {} + meta_urls = {} + splits = ("train", "dev", "test") + for split in splits: + audio_urls[split] = [_URLS["audio"].format(lang=lang, split=split, shard_idx=i) for lang in languages for i in range(n_shards[lang][split])] + meta_urls[split] = [_URLS["transcript"].format(lang=lang, split=split) for lang in languages] + archive_paths = dl_manager.download(audio_urls) + local_extracted_archive_paths = dl_manager.extract(archive_paths) + meta_paths = dl_manager.download_and_extract(meta_urls) + + split_names = { + "train": datasets.Split.TRAIN, + "dev": datasets.Split.VALIDATION, + "test": datasets.Split.TEST, + } + return [ + datasets.SplitGenerator( + name=split_names.get(split, split), + gen_kwargs={ + "local_extracted_archive_paths": local_extracted_archive_paths.get(split), + "audio_archives": [dl_manager.iter_archive(path) for path in archive_paths.get(split)], + "meta_paths": meta_paths[split], + "split": "train", + }, + ) + for split in splits + ] + + def _generate_examples(self, local_extracted_archive_paths: [Path], audio_archives: [Path], meta_paths: [Path], split: str) -> Tuple[int, Dict]: + data_fields = list(self._info().features.keys()) + metadata = {} + for meta_path in meta_paths: + with open(meta_path, encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + for row in reader: + if not row["path"].endswith(".mp3"): + row["path"] += ".mp3" + if "accents" in row: + row["accent"] = row["accents"] + del row["accents"] + for field in data_fields: + if field not in row: + row[field] = "" + metadata[row["path"]] = row + + if self.config.schema == "source": + for i, audio_archive in enumerate(audio_archives): + for path, file in audio_archive: + _, filename = os.path.split(path) + if filename in metadata: + src_result = dict(metadata[filename]) + path = os.path.join(local_extracted_archive_paths[i], path) + result = { + "client_id": src_result["client_id"], + "path": path, + "audio": {"path": path, "bytes": file.read()}, + "sentence": src_result["sentence"], + "up_votes": src_result["up_votes"], + "down_votes": src_result["down_votes"], + "age": src_result["age"], + "gender": src_result["gender"], + "accent": src_result["accent"], + "locale": src_result["locale"], + "segment": src_result["segment"], + } + yield path, result + + elif self.config.schema == "seacrowd_sptext": + for i, audio_archive in enumerate(audio_archives): + for path, file in audio_archive: + _, filename = os.path.split(path) + if filename in metadata: + src_result = dict(metadata[filename]) + # set the audio feature and the path to the extracted file + path = os.path.join(local_extracted_archive_paths[i], path) + result = { + "id": src_result["path"].replace(".mp3", ""), + "path": path, + "audio": {"path": 
path, "bytes": file.read()}, + "text": src_result["sentence"], + "speaker_id": src_result["client_id"], + "metadata": { + "speaker_age": _AGE_TO_INT[src_result["age"]], + "speaker_gender": src_result["gender"], + }, + } + yield path, result diff --git a/seacrowd/sea_datasets/cosem/__init__.py b/seacrowd/sea_datasets/cosem/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/cosem/cosem.py b/seacrowd/sea_datasets/cosem/cosem.py new file mode 100644 index 000000000..e8999b929 --- /dev/null +++ b/seacrowd/sea_datasets/cosem/cosem.py @@ -0,0 +1,173 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +_CITATION = """\ +@article{gonzales_corpus_2021, + title = {The {Corpus} of {Singapore} {English} {Messages} ({CoSEM})}, + issn = {0883-2919, 1467-971X}, + url = {https://onlinelibrary.wiley.com/doi/10.1111/weng.12534}, + doi = {10.1111/weng.12534}, + language = {en}, + urldate = {2022-02-19}, + journal = {World Englishes}, + author = {Gonzales, Wilkinson Daniel Wong and Hiramoto, Mie and R. E. Leimgruber, Jakob and Lim, Jun Jie}, + month = feb, + year = {2021}, +} +""" + +_DATASETNAME = "cosem" + +_DESCRIPTION = """\ +The CoSEM dataset consists of over 900,000 lines of online messages from the messaging platform WhatsApp collected from personal chat +logs of students enrolled in an advanced sociolinguistics class from the National University of Singapore. Messages collected were +from 2016 to 2019. The dataset is in .txt format, where each line of utterance is tagged with a unique identifier that includes its +metadata such as line number, year message was sent, and age and nationality of sender. 
+""" + +_HOMEPAGE = "https://github.com/wdwgonzales/CoSEM/blob/main/Corpus/COSEM_v4_publicrelease_SEP172023.zip" + +_LANGUAGES = ["eng"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.CC0_1_0.value + +_LOCAL = False + +_URLS = {_DATASETNAME: "https://github.com/wdwgonzales/CoSEM/raw/main/Corpus/COSEM_v4_publicrelease_SEP172023.zip"} + +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] +_SUPPORTED_SCHEMA_STRINGS = [f"seacrowd_{str(TASK_TO_SCHEMA[task]).lower()}" for task in _SUPPORTED_TASKS] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class CoSEMDataset(datasets.GeneratorBasedBuilder): + """The CoSEM dataset consists of over 900,000 lines of online messages from the messaging platform WhatsApp collected from + personal chat logs of students enrolled in an advanced sociolinguistics class from the National University of Singapore.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + subset_id = _DATASETNAME + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{subset_id}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=subset_id, + ) + ] + + seacrowd_schema_config: list[SEACrowdConfig] = [] + + for seacrowd_schema in _SUPPORTED_SCHEMA_STRINGS: + + seacrowd_schema_config.append( + SEACrowdConfig( + name=f"{subset_id}_{seacrowd_schema}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} {seacrowd_schema} schema", + schema=f"{seacrowd_schema}", + subset_id=subset_id, + ) + ) + + BUILDER_CONFIGS.extend(seacrowd_schema_config) + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + + elif self.config.schema == f"seacrowd_{str(TASK_TO_SCHEMA[Tasks.SELF_SUPERVISED_PRETRAINING]).lower()}": + features = schemas.ssp_features + + else: + raise ValueError(f"Invalid config: {self.config.name}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + split_generators = [] + + path = dl_manager.download_and_extract(_URLS[_DATASETNAME]) + + split_generators.append( + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "path": os.path.join(path, "COSEM_v4_publicrelease_SEP172023"), + }, + ) + ) + + return split_generators + + def _generate_examples(self, path: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + files = os.listdir(path) + file_paths = [os.path.join(path, file) for file in files] + pattern = r"<(COSEM:.*?)>(.*?)(?= datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "sentence": datasets.Value("string"), + "ent1": datasets.Value("string"), + "ent2": datasets.Value("string"), + "ent1_qcode": datasets.Value("string"), + "ent2_qcode": datasets.Value("string"), + "property": datasets.Value("string"), + "property_desc": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "edgeset_left": datasets.Sequence(datasets.Value("int32")), + "edgeset_right": datasets.Sequence(datasets.Value("int32")), + 
"edgeset_triple": datasets.Sequence(datasets.Value("string")), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.kb_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + data_paths = { + "csv": Path(dl_manager.download_and_extract(_URLS["csv"])), + "json": Path(dl_manager.download_and_extract(_URLS["json"])), + } + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "csv_filepath": data_paths["csv"], + "json_filepath": data_paths["json"], + "split": "train", + }, + ) + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + + def _generate_examples(self, csv_filepath: Path, json_filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + # read csv file + with open(csv_filepath, "r", encoding="utf-8") as csv_file: + csv_reader = csv.reader(csv_file) + csv_data = [row for row in csv_reader] + csv_data = csv_data[1:] # remove header + + # read json file + with open(json_filepath, "r", encoding="utf-8") as json_file: + json_data = json.load(json_file) + + # properties descriptions from https://github.com/hclent/CreoleVal/tree/main/nlu/relation_classification + # for properties present in Chavacano subset + properties_desc = {"P17": "country", "P30": "continent", "P106": "occupation", "P131": "located in the administrative territorial entity", "P361": "part of", "P1376": " capital of country"} + + num_sample = len(csv_data) + + for i in range(num_sample): + if self.config.schema == "source": + example = { + "id": str(i), + "sentence": csv_data[i][0], + "ent1": csv_data[i][1], + "ent2": csv_data[i][2], + "ent1_qcode": csv_data[i][3], + "ent2_qcode": csv_data[i][4], + "property": csv_data[i][5], + "property_desc": properties_desc[csv_data[i][5]], + "tokens": json_data[i]["tokens"], + "edgeset_left": json_data[i]["edgeSet"]["left"], + "edgeset_right": json_data[i]["edgeSet"]["right"], + "edgeset_triple": json_data[i]["edgeSet"]["triple"], + } + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + offset_entity1 = csv_data[i][0].find(csv_data[i][1]) + offset_entity2 = csv_data[i][0].find(csv_data[i][2]) + + if (offset_entity1 == -1) or (offset_entity2 == -1): + continue + + example = { + "id": str(i), + "passages": [{"id": f"passage-{i}", "type": "text", "text": [csv_data[i][0]], "offsets": [[0, len(csv_data[i][0])]]}], + "entities": [ + {"id": f"{i}-entity-{csv_data[i][3]}", "type": "text", "text": [csv_data[i][1]], "normalized": [{"db_name": csv_data[i][1], "db_id": csv_data[i][3]}], "offsets": [[offset_entity1, offset_entity1 + len(csv_data[i][1])]]}, + {"id": f"{i}-entity-{csv_data[i][4]}", "type": "text", "text": [csv_data[i][2]], "normalized": [{"db_name": csv_data[i][2], "db_id": csv_data[i][4]}], "offsets": [[offset_entity2, offset_entity2 + len(csv_data[i][2])]]}, + ], + "events": [], + "coreferences": [], + "relations": [ + { + "id": f"{i}-relation-{csv_data[i][5]}", + "type": properties_desc[csv_data[i][5]], + "arg1_id": f"{i}-entity-{csv_data[i][3]}", + "arg2_id": f"{i}-entity-{csv_data[i][4]}", + "normalized": [{"db_name": properties_desc[csv_data[i][5]], "db_id": csv_data[i][5]}], + } + ], + } + + yield i, example diff --git a/seacrowd/sea_datasets/crosssum/__init__.py 
b/seacrowd/sea_datasets/crosssum/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/crosssum/crosssum.py b/seacrowd/sea_datasets/crosssum/crosssum.py new file mode 100644 index 000000000..1a40d5909 --- /dev/null +++ b/seacrowd/sea_datasets/crosssum/crosssum.py @@ -0,0 +1,140 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@inproceedings{bhattacharjee-etal-2023-crosssum, + author = {Bhattacharjee, Abhik and Hasan, Tahmid and Ahmad, Wasi Uddin and Li, Yuan-Fang and Kang, Yong-Bin and Shahriyar, Rifat}, + title = {CrossSum: Beyond English-Centric Cross-Lingual Summarization for 1,500+ Language Pairs}, + booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics}, + publisher = {Association for Computational Linguistics}, + year = {2023}, + url = {https://aclanthology.org/2023.acl-long.143}, + doi = {10.18653/v1/2023.acl-long.143}, + pages = {2541--2564}, + } +""" + +_LOCAL = False +_LANGUAGES = ["ind", "mya", "vie"] +_DATASETNAME = "crosssum" +_DESCRIPTION = """ +This is a large-scale cross-lingual summarization dataset containing article-summary samples in 1,500+ language pairs, +including pairs with the Burmese, Indonesian and Vietnamese languages. Articles in the first language are assigned +summaries in the second language. 
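For illustration only, a minimal sketch of how a crosssum subset id is expected to resolve to an upstream csebuetnlp/CrossSum config name; it mirrors LANG_CODE_MAPPER in the loader defined below, and the commented-out load call is an assumption that requires network access.

LANG_CODE_MAPPER = {"ind": "indonesian", "mya": "burmese", "vie": "vietnamese"}

def upstream_config(subset_id: str) -> str:
    # e.g. "crosssum_ind_mya" -> "indonesian-burmese"
    src, tgt = subset_id.split("_")[-2:]
    return f"{LANG_CODE_MAPPER[src]}-{LANG_CODE_MAPPER[tgt]}"

assert upstream_config("crosssum_ind_mya") == "indonesian-burmese"
# import datasets
# ds = datasets.load_dataset("csebuetnlp/CrossSum", upstream_config("crosssum_ind_mya"), split="test")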
+""" + +_HOMEPAGE = "https://huggingface.co/datasets/csebuetnlp/CrossSum" +_LICENSE = Licenses.CC_BY_NC_SA_4_0.value +_URL = "https://huggingface.co/datasets/csebuetnlp/CrossSum" + + +_SUPPORTED_TASKS = [Tasks.CROSS_LINGUAL_SUMMARIZATION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class CrossSumDataset(datasets.GeneratorBasedBuilder): + """Dataset of cross-lingual article-summary samples.""" + + SUBSETS = [ + "ind_mya", + "ind_vie", + "mya_ind", + "mya_vie", + "vie_mya", + "vie_ind", + ] + LANG_CODE_MAPPER = {"ind": "indonesian", "mya": "burmese", "vie": "vietnamese"} + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema for {subset} subset", + schema="source", + subset_id=f"{_DATASETNAME}_{subset}", + ) + for subset in SUBSETS + ] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_seacrowd_t2t", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema for {subset} subset", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}_{subset}", + ) + for subset in SUBSETS + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_ind_mya_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "source_url": datasets.Value("string"), + "target_url": datasets.Value("string"), + "summary": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # dl_manager not used since dataloader uses HF 'load_dataset' + return [ + datasets.SplitGenerator(name=split, gen_kwargs={"split": split._name}) + for split in ( + datasets.Split.TRAIN, + datasets.Split.VALIDATION, + datasets.Split.TEST, + ) + ] + + def _load_hf_data_from_remote(self, split: str) -> datasets.DatasetDict: + """Load dataset from HuggingFace.""" + source_lang = self.LANG_CODE_MAPPER[self.config.subset_id.split("_")[-2]] + target_lang = self.LANG_CODE_MAPPER[self.config.subset_id.split("_")[-1]] + HF_REMOTE_REF = "/".join(_URL.split("/")[-2:]) + _hf_dataset_source = datasets.load_dataset(HF_REMOTE_REF, f"{source_lang}-{target_lang}", split=split) + return _hf_dataset_source + + def _generate_examples(self, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + data = self._load_hf_data_from_remote(split) + for index, row in enumerate(data): + if self.config.schema == "source": + example = row + elif self.config.schema == "seacrowd_t2t": + example = {"id": str(index), "text_1": row["text"], "text_2": row["summary"], "text_1_name": "document", "text_2_name": "summary"} + yield index, example diff --git a/seacrowd/sea_datasets/cub_bahasa/cub_bahasa.py b/seacrowd/sea_datasets/cub_bahasa/cub_bahasa.py index e14cc16a4..acc7cc21d 100644 --- a/seacrowd/sea_datasets/cub_bahasa/cub_bahasa.py +++ b/seacrowd/sea_datasets/cub_bahasa/cub_bahasa.py @@ -27,6 +27,9 @@ are required at least 10 words, without any information on subcategories and actions. 
""" +_LOCAL=False +_LANGUAGES = ["ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + _HOMEPAGE = "https://github.com/share424/Indonesian-Text-to-Image-synthesis-with-Sentence-BERT-and-FastGAN" _LICENSE = Licenses.UNKNOWN.value _URLS = { diff --git a/seacrowd/sea_datasets/culturax/culturax.py b/seacrowd/sea_datasets/culturax/culturax.py index 1a90f92e5..db5899492 100644 --- a/seacrowd/sea_datasets/culturax/culturax.py +++ b/seacrowd/sea_datasets/culturax/culturax.py @@ -29,8 +29,12 @@ mC4 and OSCAR corpora, emphasizing non-English languages to support multilingual model training. For data cleaning validation, CulturaX employs a SentencePiece tokenizer and KenLM language models, utilizing recent Wikipedia dumps for perplexity scoring. +Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/uonlp/CulturaX and use huggingface-cli login for authentication. """ +_LOCAL=False +_LANGUAGES = ["ind", "jav", "khm", "lao", "tgl", "min", "mya", "sun", "tha", "vie", "zlm", "ceb", "war", "cbk", "bcl"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + _HOMEPAGE = "https://huggingface.co/datasets/uonlp/CulturaX" _LICENSE = f"""{Licenses.OTHERS.value} | \ The licence terms for CulturaX strictly follows those of mC4 and OSCAR. \ diff --git a/seacrowd/sea_datasets/dengue_filipino/__init__.py b/seacrowd/sea_datasets/dengue_filipino/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/dengue_filipino/dengue_filipino.py b/seacrowd/sea_datasets/dengue_filipino/dengue_filipino.py new file mode 100644 index 000000000..3aaa30d05 --- /dev/null +++ b/seacrowd/sea_datasets/dengue_filipino/dengue_filipino.py @@ -0,0 +1,134 @@ +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@INPROCEEDINGS{8459963, + author={E. D. {Livelo} and C. {Cheng}}, + booktitle={2018 IEEE International Conference on Agents (ICA)}, + title={Intelligent Dengue Infoveillance Using Gated Recurrent Neural Learning and Cross-Label Frequencies}, + year={2018}, + volume={}, + number={}, + pages={2-7}, + doi={10.1109/AGENTS.2018.8459963}} + } +""" + +_LANGUAGES = ["fil"] + +# copied from https://huggingface.co/datasets/dengue_filipino/blob/main/dengue_filipino.py +_URL = "https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/dengue/dengue_raw.zip" +_DATASETNAME = "dengue_filipino" + +_DESCRIPTION = """\ +Benchmark dataset for low-resource multi-label classification, with 4,015 training, 500 testing, and 500 validation examples, each labeled as part of five classes. Each sample can be a part of multiple classes. Collected as tweets. 
+""" + +_HOMEPAGE = "https://github.com/jcblaisecruz02/Filipino-Text-Benchmarks" + +_LICENSE = Licenses.UNKNOWN.value + +_SUPPORTED_TASKS = [Tasks.DOMAIN_KNOWLEDGE_MULTICLASSIFICATION] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class DengueFilipinoDataset(datasets.GeneratorBasedBuilder): + """Dengue Dataset Low-Resource Multi-label Text Classification Dataset in Filipino""" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_text_multi", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema text multi", + schema="seacrowd_text_multi", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "text": datasets.Value("string"), + "absent": datasets.features.ClassLabel(names=["0", "1"]), + "dengue": datasets.features.ClassLabel(names=["0", "1"]), + "health": datasets.features.ClassLabel(names=["0", "1"]), + "mosquito": datasets.features.ClassLabel(names=["0", "1"]), + "sick": datasets.features.ClassLabel(names=["0", "1"]), + } + ) + elif self.config.schema == "seacrowd_text_multi": + features = schemas.text_multi_features(["0", "1"]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + supervised_keys=None, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "split": "validation", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "split": "test", + }, + ), + ] + + def _generate_examples(self, split: str) -> Tuple[int, Dict]: + dataset = datasets.load_dataset(_DATASETNAME, split=split) + for id, data in enumerate(dataset): + if self.config.schema == "source": + yield id, { + "text": data["text"], + "absent": data["absent"], + "dengue": data["dengue"], + "health": data["health"], + "mosquito": data["mosquito"], + "sick": data["sick"], + } + + elif self.config.schema == "seacrowd_text_multi": + yield id, { + "id": id, + "text": data["text"], + "labels": [ + data["absent"], + data["dengue"], + data["health"], + data["mosquito"], + data["sick"], + ], + } diff --git a/seacrowd/sea_datasets/emotes_3k/__init__.py b/seacrowd/sea_datasets/emotes_3k/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/emotes_3k/emotes_3k.py b/seacrowd/sea_datasets/emotes_3k/emotes_3k.py new file mode 100644 index 000000000..f980cbd03 --- /dev/null +++ b/seacrowd/sea_datasets/emotes_3k/emotes_3k.py @@ -0,0 +1,238 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +English-Tagalog Parallel Dataset intended for two tasks: +1. Moral Text Classification +2. Instruction Tuning +""" +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{Catapang:2023, + author = {Catapang, Jasper Kyle and Visperas, Moses}, + title = {Emotion-based Morality in Tagalog and English Scenarios (EMoTES-3K): A Parallel Corpus for Explaining (Im)morality of Actions}, + booktitle = {Proceedings of the Joint 3rd NLP4DH and 8th IWCLUL}, + pages = {1--6}, + month = {December 1-3}, + year = {2023}, + organization = {Association for Computational Linguistics}, +} +""" + +_DATASETNAME = "emotes_3k" + +_DESCRIPTION = """\ +This dataset is used on the paper "Emotion-based Morality in Tagalog and English Scenarios (EMoTES-3K): A Parallel Corpus for Explaining (Im)morality of Actions" +This dataset is designed for for two tasks: +1. Moral Text Classification +2. Instruction Tuning +""" + +_HOMEPAGE = "https://huggingface.co/datasets/NLPinas/EMoTES-3K" + +_LANGUAGES = ["tgl"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://huggingface.co/datasets/NLPinas/EMoTES-3K/resolve/main/EMoTES-3K.jsonl?download=true", +} + +_SUPPORTED_TASKS = [Tasks.MORALITY_CLASSIFICATION, Tasks.INSTRUCTION_TUNING] # Roberta moral or immoral classification # FLAN-T5 Training + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class Emotes3KDatasets(datasets.GeneratorBasedBuilder): + """ + Emotes3K consists of one human annotated dataset for the purpose of morality classification and instruction tuning. 
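As a sketch of the config naming convention used below: the language tag embedded in the config name decides whether the English or the Filipino column is emitted. The helper here is hypothetical and only approximates the checks made in _generate_examples.

def text_field(config_name: str) -> str:
    # Tagalog-tagged configs read the "Filipino" column; English-tagged and
    # untagged default configs read "English".
    return "Filipino" if "_tgl_" in config_name else "English"

assert text_field("emotes_3k_tgl_seacrowd_text") == "Filipino"
assert text_field("emotes_3k_eng_seacrowd_t2t") == "English"
assert text_field("emotes_3k_seacrowd_text") == "English"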
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_text", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_text", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_eng_seacrowd_text", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_text", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_tgl_seacrowd_text", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_text", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_t2t", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_t2t", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_eng_seacrowd_t2t", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_t2t", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_tgl_seacrowd_t2t", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_t2t", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "entry_id": datasets.Value("string"), + "Filipino": datasets.Value("string"), + "English": datasets.Value("string"), + "Annotation": datasets.ClassLabel(names=["Moral", "Immoral"]), + "Explanation": datasets.Value("string"), + "Personality Traits": datasets.Value("string"), + "Topic": datasets.Value("string"), + "Topic Name": datasets.Value("string"), + } + ) + # For example seacrowd_kb, seacrowd_t2t + elif self.config.schema == "seacrowd_text": + features = schemas.text.features(["Moral", "Immoral"]) + elif self.config.schema == "seacrowd_t2t": + features = schemas.text_to_text.features + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + path = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": path, + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + with open(filepath, "r", encoding="utf-8") as file: + for line in file: + # Use json.loads to parse each line as a JSON object + data = json.loads(line.strip()) + + if self.config.schema == "source": + yield ( + data["entry_id"], + { + "entry_id": data["entry_id"], + "Filipino": data["Filipino"], + "English": data["English"], + "Annotation": data["Annotation"], + "Explanation": data["Explanation"], + "Personality Traits": data["Personality Traits"], + "Topic": data["Topic"], + "Topic Name": data["Topic Name"], + }, + ) + elif self.config.schema == "seacrowd_text": + if "eng" in self.config.name or self.config.name == "emotes_3k_seacrowd_text": + yield ( + 
data["entry_id"], + { + "id": data["entry_id"], + "text": data["English"], + "label": data["Annotation"], + }, + ) + elif "tgl" in self.config.name: + yield ( + data["entry_id"], + { + "id": data["entry_id"], + "text": data["Filipino"], + "label": data["Annotation"], + }, + ) + elif self.config.schema == "seacrowd_t2t": + if "eng" in self.config.name or self.config.name == "emotes_3k_seacrowd_t2t": + yield ( + data["entry_id"], + { + "id": data["entry_id"], + "text_1": "Explain the morality of this scenario\n" + data["English"], + "text_2": data["Explanation"], + "text_1_name": "prompt", + "text_2_name": "system", + }, + ) + elif "tgl" in self.config.name: + yield ( + data["entry_id"], + { + "id": data["entry_id"], + "text_1": "Explain the morality of this scenario\n" + data["Filipino"], + "text_2": data["Explanation"], + "text_1_name": "prompt", + "text_2_name": "system", + }, + ) diff --git a/seacrowd/sea_datasets/etos/__init__.py b/seacrowd/sea_datasets/etos/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/etos/etos.py b/seacrowd/sea_datasets/etos/etos.py new file mode 100644 index 000000000..5f035cac9 --- /dev/null +++ b/seacrowd/sea_datasets/etos/etos.py @@ -0,0 +1,205 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import conllu +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@INPROCEEDINGS{10053062, + author={Samsuri, Mukhlizar Nirwan and Yuliawati, Arlisa and Alfina, Ika}, + booktitle={2022 5th International Seminar on Research of Information Technology and Intelligent Systems (ISRITI)}, + title={A Comparison of Distributed, PAM, and Trie Data Structure Dictionaries in Automatic Spelling Correction for Indonesian Formal Text}, + year={2022}, + pages={525-530}, + keywords={Seminars;Dictionaries;Data structures;Intelligent systems;Information technology;automatic spelling correction;distributed dictionary;non-word error;trie data structure;Partition Around Medoids}, + doi={10.1109/ISRITI56927.2022.10053062}, + url = {https://ieeexplore.ieee.org/document/10053062}, +} +""" + +_DATASETNAME = "etos" + +_DESCRIPTION = """\ +ETOS (Ejaan oTOmatiS) is a dataset for parts-of-speech (POS) tagging for formal Indonesian +text. It consists of 200 sentences, with 4,323 tokens in total, annotated following the +CoNLL format. 
+""" + +_HOMEPAGE = "https://github.com/ir-nlp-csui/etos" + +_LANGUAGES = ["ind"] + +_LICENSE = Licenses.AGPL_3_0.value + +_LOCAL = False + +_URLS = "https://raw.githubusercontent.com/ir-nlp-csui/etos/main/gold_standard.conllu" + +_SUPPORTED_TASKS = [Tasks.POS_TAGGING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class ETOSDataset(datasets.GeneratorBasedBuilder): + """ + ETOS is an Indonesian parts-of-speech (POS) tagging dataset from https://github.com/ir-nlp-csui/etos. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + UPOS_TAGS = [ + "NOUN", + "PUNCT", + "ADP", + "NUM", + "SYM", + "SCONJ", + "ADJ", + "PART", + "DET", + "CCONJ", + "PROPN", + "PRON", + "X", + "_", + "ADV", + "INTJ", + "VERB", + "AUX", + ] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_seq_label", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} sequence labeling schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "sent_id": datasets.Value("string"), + "text": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "lemmas": datasets.Sequence(datasets.Value("string")), + "upos": datasets.Sequence(datasets.features.ClassLabel(names=self.UPOS_TAGS)), + "xpos": datasets.Sequence(datasets.Value("string")), + "feats": datasets.Sequence(datasets.Value("string")), + "head": datasets.Sequence(datasets.Value("string")), + "deprel": datasets.Sequence(datasets.Value("string")), + "deps": datasets.Sequence(datasets.Value("string")), + "misc": datasets.Sequence(datasets.Value("string")), + } + ) + + elif self.config.schema == "seacrowd_seq_label": + features = schemas.seq_label_features(self.UPOS_TAGS) + + else: + raise ValueError(f"Invalid schema: '{self.config.schema}'") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """ + Returns SplitGenerators. + """ + + train_path = dl_manager.download_and_extract(_URLS) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": train_path, + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """ + Yields examples as (key, example) tuples. 
+ """ + + with open(filepath, "r", encoding="utf-8") as data_file: + tokenlist = list(conllu.parse_incr(data_file)) + + for idx, sent in enumerate(tokenlist): + if "sent_id" in sent.metadata: + sent_id = sent.metadata["sent_id"] + else: + sent_id = idx + + tokens = [token["form"] for token in sent] + + if "text" in sent.metadata: + txt = sent.metadata["text"] + else: + txt = " ".join(tokens) + + if self.config.schema == "source": + yield idx, { + "sent_id": str(sent_id), + "text": txt, + "tokens": tokens, + "lemmas": [token["lemma"] for token in sent], + "upos": [token["upos"] for token in sent], + "xpos": [token["xpos"] for token in sent], + "feats": [str(token["feats"]) for token in sent], + "head": [str(token["head"]) for token in sent], + "deprel": [str(token["deprel"]) for token in sent], + "deps": [str(token["deps"]) for token in sent], + "misc": [str(token["misc"]) for token in sent], + } + + elif self.config.schema == "seacrowd_seq_label": + yield idx, { + "id": str(sent_id), + "tokens": tokens, + "labels": [token["upos"] for token in sent], + } + + else: + raise ValueError(f"Invalid schema: '{self.config.schema}'") diff --git a/seacrowd/sea_datasets/filipino_gay_lang/__init__.py b/seacrowd/sea_datasets/filipino_gay_lang/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/filipino_gay_lang/filipino_gay_lang.py b/seacrowd/sea_datasets/filipino_gay_lang/filipino_gay_lang.py new file mode 100644 index 000000000..66a9aec16 --- /dev/null +++ b/seacrowd/sea_datasets/filipino_gay_lang/filipino_gay_lang.py @@ -0,0 +1,115 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import SCHEMA_TO_FEATURES, TASK_TO_SCHEMA, Tasks + +# TODO: Add BibTeX citation +_CITATION = r"""\ +@article{oco2015witchebelles, + author = {Oco, Nathaniel and Fajutagana, Raymart and Lim, Christine Mae and Mi{\~n}on, Judi Diane and Morano, Julie-Ann and Tinoco, Ryan Christian}, + title = {Witchebelles Anata Magcharot kay Mudra na Nagsusuba si Akech: Developing a Rule-based Unidirectional Beki Lingo to Filipino Translator}, + journal = {Journal of Sciences, Technology and Arts Research}, + volume = {1}, + number = {1}, + year = {2015} +} +""" + +_LOCAL = False +_LANGUAGES = ["fil"] +_DATASETNAME = "filipino_gay_lang" +_DESCRIPTION = """\ +The dataset contains 4000+ Filipino tweets in gay language or lingo also called swardspeak in slang terminology. +The tweet dataset was collected from February 2013 to November 2014 using the following commonly used gay words as filters: jinet ("hot"), ditey ("here"), imbyerna ("annoying"), etc. +The original paper makes use of the corpus to develop a gay language translator to understand the meaning of phrases using gay words in Filipino. 
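For illustration, a sketch of the Excel-concatenation pattern used by the loader below; the file names stand in for the three downloaded sheets, and reading .xlsx with pandas assumes an engine such as openpyxl is installed.

import pandas as pd

files = ["gl - 01.xlsx", "gl - 02.xlsx", "gl - 03.xlsx"]  # placeholder local copies
df = pd.concat((pd.read_excel(f) for f in files), ignore_index=True)

# Each sheet stores the tweet text in a `message` column.
examples = [{"id": str(i), "text": text} for i, text in enumerate(df["message"])]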
+""" + +_HOMEPAGE = "https://github.com/imperialite/Philippine-Languages-Online-Corpora/tree/master/Tweets/Gay%20language" +_LICENSE = "CC-BY-SA 4.0" +_URLS = { + "gl_01": "https://github.com/imperialite/Philippine-Languages-Online-Corpora/raw/master/Tweets/Gay%20language/gl%20-%2001.xlsx", + "gl_02": "https://github.com/imperialite/Philippine-Languages-Online-Corpora/raw/master/Tweets/Gay%20language/gl%20-%2002.xlsx", + "gl_03": "https://github.com/imperialite/Philippine-Languages-Online-Corpora/raw/master/Tweets/Gay%20language/gl%20-%2003.xlsx", +} + +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class FilipinoGayLangDataset(datasets.GeneratorBasedBuilder): + """This dataset contains 4000+ Filipino tweets in gay lingo/Beki/Swardspeak.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower() + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features({"index": datasets.Value("string"), "text": datasets.Value("string")}) + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = SCHEMA_TO_FEATURES[self.SEACROWD_SCHEMA_NAME.upper()] + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + data_files = { + "gl_01": Path(dl_manager.download(_URLS["gl_01"])), + "gl_02": Path(dl_manager.download(_URLS["gl_02"])), + "gl_03": Path(dl_manager.download(_URLS["gl_03"])), + } + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": [data_files["gl_01"], data_files["gl_02"], data_files["gl_03"]], "split": "train"}, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + df = pd.concat((pd.read_excel(file) for file in filepath), ignore_index=True).reset_index() + + if self.config.schema == "source": + for row in df.itertuples(): + ex = {"index": str(row.index), "text": row.message} + yield row.index, ex + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + for row in df.itertuples(): + ex = {"id": str(row.index), "text": row.message} + yield row.index, ex + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/filipino_hatespeech_election/__init__.py b/seacrowd/sea_datasets/filipino_hatespeech_election/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/filipino_hatespeech_election/filipino_hatespeech_election.py b/seacrowd/sea_datasets/filipino_hatespeech_election/filipino_hatespeech_election.py new file mode 100644 index 000000000..50962df6d --- /dev/null +++ 
b/seacrowd/sea_datasets/filipino_hatespeech_election/filipino_hatespeech_election.py @@ -0,0 +1,124 @@ +import csv +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@article{Cabasag-2019-hate-speech, + title={Hate speech in Philippine election-related tweets: Automatic detection and classification using natural language processing.}, + author={Neil Vicente Cabasag, Vicente Raphael Chan, Sean Christian Lim, Mark Edward Gonzales, and Charibeth Cheng}, + journal={Philippine Computing Journal}, + volume={XIV}, + number={1}, + month={August}, + year={2019} +} +""" + +_DATASETNAME = "filipino_hatespeech_election" + +_DESCRIPTION = """ +The dataset used in this study was a subset of the corpus 1,696,613 tweets crawled by Andrade et al. and posted from November 2015 to May 2016 during the campaign period for the Philippine presidential election. They were culled +based on the presence of candidate names (e.g., Binay, Duterte, Poe, Roxas, and Santiago) and election-related hashtags (e.g., #Halalan2016, #Eleksyon2016, and #PiliPinas2016). Data preprocessing was performed to prepare the +tweets for feature extraction and classification. It consisted of the following steps: data de-identification, uniform resource locator (URL) removal, special character processing, normalization, hashtag processing, and tokenization. +""" + +_HOMEPAGE = "https://huggingface.co/datasets/hate_speech_filipino" + +_LANGUAGES = ["fil"] + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = {_DATASETNAME: "https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/hatenonhate/hatespeech_raw.zip"} + +_SUPPORTED_TASKS = [Tasks.ABUSIVE_LANGUAGE_PREDICTION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + +_CLASSES = ["0", "1"] # corresponds to ["non-hate-containing", "hate-containing"] + + +class FilipinoHatespeechElectionDataset(datasets.GeneratorBasedBuilder): + """Hate Speech Text Classification Dataset in Filipino.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_text", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_text", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + features = schemas.text_features(label_names=_CLASSES) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(data_dir, "hatespeech", "train.csv"), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": os.path.join(data_dir, "hatespeech", "test.csv"), + "split": "test", + }, + ), + datasets.SplitGenerator( + 
name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir, "hatespeech", "valid.csv"), + "split": "dev", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + with open(filepath, encoding="utf-8") as csv_file: + csv_reader = csv.reader(csv_file, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True) + next(csv_reader) + for i, row in enumerate(csv_reader): + try: + text, label = row + yield i, {"id": str(i), "text": row[0], "label": _CLASSES[int(row[1].strip()) - 1]} + except ValueError: + pass diff --git a/seacrowd/sea_datasets/filwordnet/__init__.py b/seacrowd/sea_datasets/filwordnet/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/filwordnet/filwordnet.py b/seacrowd/sea_datasets/filwordnet/filwordnet.py new file mode 100644 index 000000000..a2d66efdc --- /dev/null +++ b/seacrowd/sea_datasets/filwordnet/filwordnet.py @@ -0,0 +1,146 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses + +_CITATION = """\ +@article{article, +author = {Borra, Allan and Pease, Adam and Edita, Rachel and Roxas, and Dita, Shirley}, +year = {2010}, +month = {01}, +pages = {}, +title = {Introducing Filipino WordNet} +} +""" + +_DATASETNAME = "filwordnet" + +_DESCRIPTION = """\ +Filipino WordNet (FilWordNet) is a lexical database of Filipino language. +It was derived from the Princeton WordNet and translated by humans to Filipino. +It documents 13,539 unique words and 9,519 synsets. Each synset includes the definition, +part-of-speech, word senses, and Suggested Upper Merged Ontology terms (SUMO terms). 
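For illustration, a sketch of reading the FilWordNet CSV directly; the column order mirrors the source features declared by the loader below, while the local file name and encoding are assumptions.

import csv

columns = ["word_id", "lemma", "synset_id", "sense_id", "pos",
           "lexdomain_id", "definition", "last_modifier", "sumo"]

with open("filwordnet.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    next(reader)  # the first row holds the column names
    for key, row in enumerate(reader):
        example = dict(zip(columns, row))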
+""" + +_HOMEPAGE = "https://github.com/danjohnvelasco/Filipino-WordNet" + +_LANGUAGES = ["fil"] + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://raw.githubusercontent.com/danjohnvelasco/Filipino-WordNet/main/filwordnet.csv", +} + +_SUPPORTED_TASKS = [] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class FilWordNetDataset(datasets.GeneratorBasedBuilder): + """The Filipino WordNet (FilWordNet) is a lexical database of Filipino language containing 13,539 unique words and 9,519 synsets.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ) + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "word_id": datasets.Value("int32"), + "lemma": datasets.Value("string"), + "synset_id": datasets.Value("int32"), + "sense_id": datasets.Value("int32"), + "pos": datasets.Value("string"), + "lexdomain_id": datasets.Value("int32"), + "definition": datasets.Value("string"), + "last_modifier": datasets.Value("int32"), + "sumo": datasets.Value("string"), + } + ) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + file = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": file, + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + rows = [] + is_first_row = True + with open(filepath, "r") as file: + csv_reader = csv.reader(file, delimiter=",") + for row in csv_reader: + if is_first_row: # skip first row, they are column names + is_first_row = False + continue + + rows.append(row) + + if self.config.schema == "source": + for key, row in enumerate(rows): + example = { + "word_id": row[0], + "lemma": row[1], + "synset_id": row[2], + "sense_id": row[3], + "pos": row[4], + "lexdomain_id": row[5], + "definition": row[6], + "last_modifier": row[7], + "sumo": row[8], + } + yield key, example diff --git a/seacrowd/sea_datasets/flores200/__init__.py b/seacrowd/sea_datasets/flores200/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/flores200/flores200.py b/seacrowd/sea_datasets/flores200/flores200.py new file mode 100644 index 000000000..eec7827ee --- /dev/null +++ b/seacrowd/sea_datasets/flores200/flores200.py @@ -0,0 +1,475 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import zipfile +from dataclasses import dataclass +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import (SCHEMA_TO_FEATURES, TASK_TO_SCHEMA, + Licenses, Tasks) + +_CITATION = """\ +@article{nllb2022, + author = {NLLB Team, Marta R. Costa-jussà, James Cross, Onur Çelebi, Maha Elbayad, Kenneth Heafield, Kevin Heffernan, Elahe + Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang, Guillaume Wenzek, Al Youngblood, Bapi Akula, Loic + Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti, John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon + Spruit, Chau Tran, Pierre Andrews, Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao, Vedanuj Goswami, + Francisco Guzmán, Philipp Koehn, Alexandre Mourachko, Christophe Ropers, Safiyyah Saleem, Holger Schwenk, Jeff Wang}, + title = {No Language Left Behind: Scaling Human-Centered Machine Translation}, + year = {2022} +} +@inproceedings{, + title={The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, + author={Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm\'{a}n, Francisco and Fan, Angela}, + year={2021} +} +@inproceedings{, + title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English}, + author={Guzm\'{a}n, Francisco and Chen, Peng-Jen and Ott, Myle and Pino, Juan and Lample, Guillaume and Koehn, Philipp and Chaudhary, Vishrav and Ranzato, Marc'Aurelio}, + journal={arXiv preprint arXiv:1902.01382}, + year={2019} +} +""" + +_DATASETNAME = "flores200" + +_DESCRIPTION = """\ +The creation of FLORES-200 doubles the existing language coverage of FLORES-101. +Given the nature of the new languages, which have less standardization and require +more specialized professional translations, the verification process became more complex. +This required modifications to the translation workflow. FLORES-200 has several languages +which were not translated from English. Specifically, several languages were translated +from Spanish, French, Russian and Modern Standard Arabic. Moreover, FLORES-200 also +includes two script alternatives for four languages. FLORES-200 consists of translations +from 842 distinct web articles, totaling 3001 sentences. These sentences are divided +into three splits: dev, devtest, and test (hidden). On average, sentences are approximately +21 words long. 
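As a sketch of the subset enumeration rule used by the builder below: an ordered language pair is kept only when the two codes differ and at least one side's ISO 639-3 prefix appears in _LANGUAGES. The helper is hypothetical but mirrors the skip condition in BUILDER_CONFIGS.

def keep_pair(first: str, second: str, sea_langs) -> bool:
    if first == second:
        return False
    return first.split("_")[0] in sea_langs or second.split("_")[0] in sea_langs

assert keep_pair("ind_Latn", "eng_Latn", {"ind", "vie"})
assert not keep_pair("eng_Latn", "fra_Latn", {"ind", "vie"})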
+""" + +_HOMEPAGE = "https://github.com/facebookresearch/flores" + +_LANGUAGES = [ + "ace", + "ban", + "bjn", + "bug", + "ceb", + "ilo", + "ind", + "jav", + "kac", + "khm", + "lao", + "lus", + "min", + "mya", + "pag", + "shn", + "sun", + "tgl", + "tha", + "vie", + "war", + "zsm", +] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LANGUAGE_NAMES = [ + "ace_Arab", + "ace_Latn", + "acm_Arab", + "acq_Arab", + "aeb_Arab", + "afr_Latn", + "ajp_Arab", + "aka_Latn", + "als_Latn", + "amh_Ethi", + "apc_Arab", + "arb_Arab", + "arb_Latn", + "ars_Arab", + "ary_Arab", + "arz_Arab", + "asm_Beng", + "ast_Latn", + "awa_Deva", + "ayr_Latn", + "azb_Arab", + "azj_Latn", + "bak_Cyrl", + "bam_Latn", + "ban_Latn", + "bel_Cyrl", + "bem_Latn", + "ben_Beng", + "bho_Deva", + "bjn_Arab", + "bjn_Latn", + "bod_Tibt", + "bos_Latn", + "bug_Latn", + "bul_Cyrl", + "cat_Latn", + "ceb_Latn", + "ces_Latn", + "cjk_Latn", + "ckb_Arab", + "cmn_Hans", + "cmn_Hant", + "crh_Latn", + "cym_Latn", + "dan_Latn", + "deu_Latn", + "dik_Latn", + "dyu_Latn", + "dzo_Tibt", + "ell_Grek", + "eng_Latn", + "epo_Latn", + "est_Latn", + "eus_Latn", + "ewe_Latn", + "fao_Latn", + "fij_Latn", + "fin_Latn", + "fon_Latn", + "fra_Latn", + "fur_Latn", + "fuv_Latn", + "gla_Latn", + "gle_Latn", + "glg_Latn", + "grn_Latn", + "guj_Gujr", + "hat_Latn", + "hau_Latn", + "heb_Hebr", + "hin_Deva", + "hne_Deva", + "hrv_Latn", + "hun_Latn", + "hye_Armn", + "ibo_Latn", + "ilo_Latn", + "ind_Latn", + "isl_Latn", + "ita_Latn", + "jav_Latn", + "jpn_Jpan", + "kab_Latn", + "kac_Latn", + "kam_Latn", + "kan_Knda", + "kas_Arab", + "kas_Deva", + "kat_Geor", + "knc_Arab", + "knc_Latn", + "kaz_Cyrl", + "kbp_Latn", + "kea_Latn", + "khm_Khmr", + "kik_Latn", + "kin_Latn", + "kir_Cyrl", + "kmb_Latn", + "kmr_Latn", + "kon_Latn", + "kor_Hang", + "lao_Laoo", + "lij_Latn", + "fil_Latn", + "lim_Latn", + "lin_Latn", + "lit_Latn", + "lmo_Latn", + "ltg_Latn", + "ltz_Latn", + "lua_Latn", + "lug_Latn", + "luo_Latn", + "lus_Latn", + "lvs_Latn", + "mag_Deva", + "mai_Deva", + "mal_Mlym", + "mar_Deva", + "min_Arab", + "min_Latn", + "mkd_Cyrl", + "plt_Latn", + "mlt_Latn", + "mni_Beng", + "khk_Cyrl", + "mos_Latn", + "mri_Latn", + "mya_Mymr", + "nld_Latn", + "nno_Latn", + "nob_Latn", + "npi_Deva", + "nqo_Nkoo", + "nso_Latn", + "nus_Latn", + "nya_Latn", + "oci_Latn", + "gaz_Latn", + "ory_Orya", + "pag_Latn", + "pan_Guru", + "pap_Latn", + "pes_Arab", + "pol_Latn", + "por_Latn", + "prs_Arab", + "pbt_Arab", + "quy_Latn", + "ron_Latn", + "run_Latn", + "rus_Cyrl", + "sag_Latn", + "san_Deva", + "sat_Olck", + "scn_Latn", + "shn_Mymr", + "sin_Sinh", + "slk_Latn", + "slv_Latn", + "smo_Latn", + "sna_Latn", + "snd_Arab", + "som_Latn", + "sot_Latn", + "spa_Latn", + "srd_Latn", + "srp_Cyrl", + "ssw_Latn", + "sun_Latn", + "swe_Latn", + "swh_Latn", + "szl_Latn", + "tam_Taml", + "tat_Cyrl", + "tel_Telu", + "tgk_Cyrl", + "tha_Thai", + "tir_Ethi", + "taq_Latn", + "taq_Tfng", + "tpi_Latn", + "tsn_Latn", + "tso_Latn", + "tuk_Latn", + "tum_Latn", + "tur_Latn", + "twi_Latn", + "uig_Arab", + "ukr_Cyrl", + "umb_Latn", + "urd_Arab", + "uzn_Latn", + "vec_Latn", + "vie_Latn", + "war_Latn", + "wol_Latn", + "xho_Latn", + "ydd_Hebr", + "yor_Latn", + "yue_Hant", + "zgh_Tfng", + "zsm_Latn", + "zul_Latn", +] + +_LICENSE = Licenses.CC_BY_NC_4_0.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://github.com/openlanguagedata/flores/releases/download/v2.0-alpha.2/floresp-v2.0-alpha.2.zip", +} + +_SPLITS = ["dev", "devtest"] + +_SENTENCES_PATHS = {lang: {split: 
os.path.join("floresp-v2.0-alpha.2", split, f"{split}.{lang}") for split in _SPLITS} for lang in _LANGUAGE_NAMES} + +_METADATA_PATHS = {split: os.path.join("floresp-v2.0-alpha.2", f"metadata_{split}.tsv") for split in _SPLITS} + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] +_SUPPORTED_SCHEMA_STRINGS = [f"seacrowd_{str(TASK_TO_SCHEMA[task]).lower()}" for task in _SUPPORTED_TASKS] + +_SCHEMAS = [str(TASK_TO_SCHEMA[task]) for task in _SUPPORTED_TASKS] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +@dataclass +class Flores200SeacrowdConfig(SEACrowdConfig): + """BuilderConfig for Nusantara.""" + + first_language_name: str = None + second_language_name: str = None + + +class Flores200(datasets.GeneratorBasedBuilder): + """ + The creation of FLORES-200 doubles the existing language coverage of FLORES-101. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [] + + for first_lang_name in _LANGUAGE_NAMES: + for second_lang_name in _LANGUAGE_NAMES: + if first_lang_name == second_lang_name or ((first_lang_name.split("_")[0] not in _LANGUAGES) and (second_lang_name.split("_")[0] not in _LANGUAGES)): + continue + + subset_id = f"{_DATASETNAME}_{first_lang_name}_{second_lang_name}" + + BUILDER_CONFIGS.append( + Flores200SeacrowdConfig( + name=f"{subset_id}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=subset_id, + first_language_name=first_lang_name, + second_language_name=second_lang_name, + ) + ) + + seacrowd_schema_config: list[SEACrowdConfig] = [] + + for seacrowd_schema in _SUPPORTED_SCHEMA_STRINGS: + + seacrowd_schema_config.append( + Flores200SeacrowdConfig( + name=f"{subset_id}_{seacrowd_schema}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} {seacrowd_schema} schema", + schema=f"{seacrowd_schema}", + subset_id=subset_id, + first_language_name=first_lang_name, + second_language_name=second_lang_name, + ) + ) + + BUILDER_CONFIGS.extend(seacrowd_schema_config) + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_LANGUAGE_NAMES[0]}_{_LANGUAGE_NAMES[1]}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("int32"), + "URL": datasets.Value("string"), + "domain": datasets.Value("string"), + "topic": datasets.Value("string"), + "has_image": datasets.Value("int32"), + "has_hyperlink": datasets.Value("int32"), + } + ) + + features[self.config.first_language_name] = datasets.Value("string") + features[self.config.second_language_name] = datasets.Value("string") + + else: + schema = str(self.config.schema).lstrip(f"{_DATASETNAME}_seacrowd_").upper() + + if schema in _SCHEMAS: + features = SCHEMA_TO_FEATURES[schema] + + else: + raise ValueError(f"Invalid config: {self.config.name}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + dl_dir = dl_manager.download(_URLS[_DATASETNAME]) + + base_dir = os.path.join(os.path.dirname(dl_dir), "flores200extracted") + + password = "multilingual machine translation" + + with zipfile.ZipFile(dl_dir, "r") as zip_ref: + # Set the password to extract the contents + zip_ref.setpassword(bytes(password, "utf-8")) + + # Extract all contents to the specified directory + 
zip_ref.extractall(base_dir) + + return [ + datasets.SplitGenerator( + name=split, + gen_kwargs={ + "first_sentence_path": os.path.join(base_dir, _SENTENCES_PATHS[self.config.first_language_name][split]), + "second_sentence_path": os.path.join(base_dir, _SENTENCES_PATHS[self.config.second_language_name][split]), + "metadata_path": os.path.join(base_dir, _METADATA_PATHS[split]), + }, + ) + for split in _SPLITS + ] + + def _generate_examples(self, first_sentence_path: str, second_sentence_path: str, metadata_path: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + sentences = {} + langs = [self.config.first_language_name, self.config.second_language_name] + + for path, lang in zip([first_sentence_path, second_sentence_path], langs): + with open(path, "r") as sent_file: + sentences[lang] = [line.strip() for line in sent_file.readlines()] + + with open(metadata_path, "r") as metadata_file: + metadata_lines = [line.strip() for line in metadata_file.readlines()[1:]] + + if self.config.schema == "source": + for id_, metadata in enumerate(metadata_lines): + metadata = metadata.split("\t") + yield id_, { + **{"id": id_ + 1, "URL": metadata[0], "domain": metadata[1], "topic": metadata[2], "has_image": 1 if metadata == "yes" else 0, "has_hyperlink": 1 if metadata == "yes" else 0}, + **{f"{lang}": sentences[lang][id_] for lang in langs}, + } + + elif self.config.schema == f"seacrowd_{str(TASK_TO_SCHEMA[Tasks.MACHINE_TRANSLATION]).lower()}": + for id_, _ in enumerate(metadata_lines): + yield id_, { + "id": id_ + 1, + "text_1": sentences[self.config.first_language_name][id_], + "text_2": sentences[self.config.second_language_name][id_], + "text_1_name": self.config.first_language_name, + "text_2_name": self.config.second_language_name, + } + + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/fsl_105/__init__.py b/seacrowd/sea_datasets/fsl_105/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/fsl_105/fsl_105.py b/seacrowd/sea_datasets/fsl_105/fsl_105.py new file mode 100644 index 000000000..e6adfe7bb --- /dev/null +++ b/seacrowd/sea_datasets/fsl_105/fsl_105.py @@ -0,0 +1,191 @@ +import os +from pathlib import Path, PureWindowsPath +from typing import Dict, List, Tuple + +import cv2 +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{tupal4476867fsl105, + title={FSL105: The Video Filipino Sign Language Sign Database of Introductory 105 FSL Signs}, + author={Tupal, Isaiah Jassen Lizaso and Melvin, Cabatuan K}, + journal={Available at SSRN 4476867} +} +""" + +_DATASETNAME = "fsl_105" + +_DESCRIPTION = """\ +FSL-105 is a video dataset for 105 different Filipino Sign Language (FSL) signs. +Each sign is categorized into one of 10 categories and is each represented by approximately 20 four-second video samples. +Signs were performed by adult deaf FSL signers on a blank blue background and reviewed by an FSL expert. 
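For context, a minimal sketch of probing a clip's metadata with OpenCV, as the FSL-105 loader below does for every sample; "clip.mp4" is a placeholder path.

import cv2

video = cv2.VideoCapture("clip.mp4")
fps = video.get(cv2.CAP_PROP_FPS)
frames = video.get(cv2.CAP_PROP_FRAME_COUNT)
width = video.get(cv2.CAP_PROP_FRAME_WIDTH)
height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
duration = frames / fps if fps else 0.0  # seconds
video.release()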
+""" + +_HOMEPAGE = "https://data.mendeley.com/datasets/48y2y99mb9/2" + +_LICENSE = Licenses.CC_BY_4_0.value + +_LOCAL = False + +_URLS = { + "clips": "https://prod-dcd-datasets-public-files-eu-west-1.s3.eu-west-1.amazonaws.com/de95a3c3-02f4-4a3f-9a9e-ce2371160275", + "train": "https://prod-dcd-datasets-public-files-eu-west-1.s3.eu-west-1.amazonaws.com/09c71779-3a2a-4c98-8d9b-0ef74f54d92a", + "test": "https://prod-dcd-datasets-public-files-eu-west-1.s3.eu-west-1.amazonaws.com/39af8117-6b44-47b9-a551-0bdc40837295", +} + +_LANGUAGES = ["psp"] + +_SUPPORTED_TASKS = [Tasks.VIDEO_TO_TEXT_RETRIEVAL, Tasks.VIDEO_CAPTIONING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class FSL105Dataset(datasets.GeneratorBasedBuilder): + """ + FSL-105 is a video dataset for 105 different Filipino Sign Language (FSL) signs. + Each sign is categorized into one of 10 categories and is each represented by approximately 20 four-second video samples. + Signs were performed by adult deaf FSL signers on a blank blue background and reviewed by an FSL expert. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_vidtext", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_vidtext", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + category = [ + "CALENDAR", + "COLOR", + "DAYS", + "DRINK", + "FAMILY", + "FOOD", + "GREETING", + "NUMBER", + "RELATIONSHIPS", + "SURVIVAL", + ] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "video_path": datasets.Value("string"), + "text": datasets.Value("string"), + "labels": datasets.ClassLabel(names=self.category), + "metadata": { + "resolution": { + "width": datasets.Value("int64"), + "height": datasets.Value("int64"), + }, + "duration": datasets.Value("float32"), + "fps": datasets.Value("float32"), + }, + } + ) + + elif self.config.schema == "seacrowd_vidtext": + features = schemas.video_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + clips = dl_manager.download_and_extract(_URLS["clips"]) + train = dl_manager.download_and_extract(_URLS["train"]) + test = dl_manager.download_and_extract(_URLS["test"]) + + train_df = pd.read_csv(train) + test_df = pd.read_csv(test) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": { + "clips": clips, + "data": train_df, + }, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": {"clips": clips, "data": test_df}, + "split": "test", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + for key, example in filepath["data"].iterrows(): + video = cv2.VideoCapture(os.path.join(filepath["clips"], PureWindowsPath(example["vid_path"]).as_posix())) + fps = video.get(cv2.CAP_PROP_FPS) + 
frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT) + duration = frame_count / fps + vid_width = video.get(cv2.CAP_PROP_FRAME_WIDTH) + vid_height = video.get(cv2.CAP_PROP_FRAME_HEIGHT) + + if self.config.schema == "source": + yield key, { + "id": str(key), + "video_path": os.path.join(filepath["clips"], example["vid_path"]), + "text": example["label"], + "labels": example["category"], + "metadata": { + "resolution": { + "width": vid_width, + "height": vid_height, + }, + "duration": duration, + "fps": fps, + }, + } + elif self.config.schema == "seacrowd_vidtext": + yield key, { + "id": str(key), + "video_path": os.path.join(filepath["clips"], example["vid_path"]), + "text": example["label"], + "metadata": { + "resolution": { + "width": vid_width, + "height": vid_height, + }, + "duration": duration, + "fps": fps, + }, + } diff --git a/seacrowd/sea_datasets/gatitos/__init__.py b/seacrowd/sea_datasets/gatitos/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/gatitos/gatitos.py b/seacrowd/sea_datasets/gatitos/gatitos.py new file mode 100644 index 000000000..eb71661b0 --- /dev/null +++ b/seacrowd/sea_datasets/gatitos/gatitos.py @@ -0,0 +1,140 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +The GATITOS (Google's Additional Translations Into Tail-languages: Often Short) dataset is a high-quality, multi-way parallel dataset of tokens and short phrases. +This dataset consists in 4,000 English segments (4,500 tokens) that have been translated into each of 173 languages, 170 of which are low-resource, 23 are spoken in Southeast Asia. +This dataset contains primarily short segments: 93% single tokens, and only 23 sentences (0.6%) have over 5 tokens. +As such it is best thought of as a multilingual lexicon, rather than a parallel training corpus. +The source text is frequent words in the English Language, along with some common phrases and short sentences. +Care has been taken to ensure that they include good coverage of numbers, months, days of the week, swadesh words, and names of the languages themselves (including the endonym). +""" +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@misc{jones2023bilex, + title={Bilex Rx: Lexical Data Augmentation for Massively Multilingual Machine Translation}, + author={Alex Jones and Isaac Caswell and Ishank Saxena and Orhan Firat}, + year={2023}, + eprint={2303.15265}, + archivePrefix={arXiv}, + primaryClass={cs.CL} + } +} +""" + +_DATASETNAME = "gatitos" + +_DESCRIPTION = """\ +The GATITOS (Google's Additional Translations Into Tail-languages: Often Short) dataset is a high-quality, multi-way parallel dataset of tokens and short phrases. 
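For illustration, a sketch of how one downloaded GATITOS file is expected to be read: a two-column TSV of source and target phrases; "en_ilo.tsv" is a placeholder for the file fetched by the loader below.

with open("en_ilo.tsv", encoding="utf-8") as tsv_file:
    for row_id, line in enumerate(tsv_file):
        src_text, tgt_text = line.rstrip("\n").split("\t")
        example = {"id": row_id, "text_1": src_text, "text_2": tgt_text}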
+This dataset consists in 4,000 English segments (4,500 tokens) that have been translated into each of 173 languages, 170 of which are low-resource, 23 are spoken in Southeast Asia. +This dataset contains primarily short segments: 93% single tokens, and only 23 sentences (0.6%) have over 5 tokens. +As such it is best thought of as a multilingual lexicon, rather than a parallel training corpus. +The source text is frequent words in the English Language, along with some common phrases and short sentences. +Care has been taken to ensure that they include good coverage of numbers, months, days of the week, swadesh words, and names of the languages themselves (including the endonym). +""" + +_HOMEPAGE = "https://github.com/google-research/url-nlp/blob/main/gatitos/README.md" + +_LANGUAGES = ["ace", "ban", "bbc", "bew", "bjn", "bts", "btx", "bug", "cnh", "hil", "iba", "ilo", "kac", "lus", "mad", "mak", "meo", "min", "pag", "pam", "shn", "tet", "war"] + +_LICENSE = Licenses.CC_BY_4_0.value + +_LOCAL = False + +_URLs = "https://raw.githubusercontent.com/google-research/url-nlp/main/gatitos/{src}_{tgt}.tsv" + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class GATITOSDataset(datasets.GeneratorBasedBuilder): + """The GATITOS (Google's Additional Translations Into Tail-languages: Often Short) dataset is a high-quality, multi-way parallel dataset of tokens and short phrases.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{src_lang}_{tgt_lang}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{src_lang}_{tgt_lang}", + ) + for (src_lang, tgt_lang) in [("eng", lang) for lang in _LANGUAGES] + [(lang, "eng") for lang in _LANGUAGES] + ] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{src_lang}_{tgt_lang}_seacrowd_t2t", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}_{src_lang}_{tgt_lang}", + ) + for (src_lang, tgt_lang) in [("eng", lang) for lang in _LANGUAGES] + [(lang, "eng") for lang in _LANGUAGES] + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features({"id": datasets.Value("string"), "src_text": datasets.Value("string"), "tgt_text": datasets.Value("string")}) + + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + _, src_lang, tgt_lang = self.config.subset_id.split("_") + + filepath = dl_manager.download_and_extract(_URLs.format(src=src_lang.replace("eng", "en"), tgt=tgt_lang.replace("eng", "en"))) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={"filepath": filepath, "src_lang": src_lang, "tgt_lang": tgt_lang}, + ) + ] + + def _generate_examples(self, src_lang: str, tgt_lang: str, filepath: Path) -> Tuple[int, Dict]: + if self.config.schema == "source": + for row_id, row in 
enumerate(open(filepath)): + src_text, tgt_text = row.strip().split("\t") + yield row_id, {"id": row_id, "src_text": src_text, "tgt_text": tgt_text} + + elif self.config.schema == "seacrowd_t2t": + for row_id, row in enumerate(open(filepath)): + src_text, tgt_text = row.strip().split("\t") + yield row_id, {"id": row_id, "text_1": src_text, "text_2": tgt_text, "text_1_name": src_lang, "text_2_name": tgt_lang} diff --git a/seacrowd/sea_datasets/gklmip_newsclass/__init__.py b/seacrowd/sea_datasets/gklmip_newsclass/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/gklmip_newsclass/gklmip_newsclass.py b/seacrowd/sea_datasets/gklmip_newsclass/gklmip_newsclass.py new file mode 100644 index 000000000..1df09bc28 --- /dev/null +++ b/seacrowd/sea_datasets/gklmip_newsclass/gklmip_newsclass.py @@ -0,0 +1,171 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import numpy as np +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{, +author="Jiang, Shengyi +and Fu, Sihui +and Lin, Nankai +and Fu, Yingwen", +title="Pre-trained Models and Evaluation Data for the Khmer Language", +year="2021", +publisher="Tsinghua Science and Technology", +} +""" + +_DATASETNAME = "gklmip_newsclass" + +_DESCRIPTION = """\ +The GKLMIP Khmer News Dataset is scraped from the Voice of America Khmer website. \ +The news articles in the dataset are categorized into 8 categories: culture, economics, education, \ +environment, health, politics, rights and science. +""" + +_HOMEPAGE = "https://github.com/GKLMIP/Pretrained-Models-For-Khmer" +_LANGUAGES = ["khm"] + +_LICENSE = Licenses.UNKNOWN.value +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://github.com/GKLMIP/Pretrained-Models-For-Khmer/raw/main/NewsDataset.zip", +} + +_SUPPORTED_TASKS = [Tasks.TOPIC_MODELING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + +_TAGS = ["culture", "economic", "education", "environment", "health", "politics", "right", "science"] + + +class GklmipNewsclass(datasets.GeneratorBasedBuilder): + """\ + The GKLMIP Khmer News Dataset is scraped from the Voice of America Khmer website. \ + The news articles in the dataset are categorized into 8 categories: culture, economics, education, \ + environment, health, politics, rights and science. 
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + SEACROWD_SCHEMA_NAME = "text" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "text": datasets.Value("string"), + "culture": datasets.Value("bool"), + "economic": datasets.Value("bool"), + "education": datasets.Value("bool"), + "environment": datasets.Value("bool"), + "health": datasets.Value("bool"), + "politics": datasets.Value("bool"), + "right": datasets.Value("bool"), + "science": datasets.Value("bool"), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text_features(_TAGS) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(data_dir, "train.csv"), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": os.path.join(data_dir, "test.csv"), + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir, "dev.csv"), + "split": "dev", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + dataset = pd.read_csv(filepath) + reverse_encoding = dict(zip(range(len(_TAGS)), _TAGS)) + if self.config.schema == "source": + for i, row in dataset.iterrows(): + yield i, { + "text": row["text"], + "culture": row["culture"], + "economic": row["economic"], + "education": row["education"], + "environment": row["environment"], + "health": row["health"], + "politics": row["politics"], + "right": row["right"], + "science": row["science"], + } + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + for i, row in dataset.iterrows(): + yield i, {"id": i, "text": row["text"], "label": reverse_encoding[np.argmax(row[_TAGS])]} diff --git a/seacrowd/sea_datasets/gklmip_sentiment/__init__.py b/seacrowd/sea_datasets/gklmip_sentiment/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/gklmip_sentiment/gklmip_sentiment.py b/seacrowd/sea_datasets/gklmip_sentiment/gklmip_sentiment.py new file mode 100644 index 000000000..28b958d22 --- /dev/null +++ b/seacrowd/sea_datasets/gklmip_sentiment/gklmip_sentiment.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@InProceedings{, +author="Jiang, Shengyi +and Huang, Xiuwen +and Cai, Xiaonan +and Lin, Nankai", +title="Pre-trained Models and Evaluation Data for the Myanmar Language", +booktitle="The 28th International Conference on Neural Information Processing", +year="2021", +publisher="Springer International Publishing", +address="Cham", +} +""" + +_DATASETNAME = "gklmip_sentiment" +_DESCRIPTION = """\ +The GKLMIP Product Sentiment Dataset is a Burmese dataset for sentiment analysis. \ +It was created by crawling comments on an e-commerce website. The sentiment labels range \ +from 1 to 5, with 1 and 2 being negative, 3 and 4 being neutral, and 5 being positive. +""" + +_HOMEPAGE = "https://github.com/GKLMIP/Pretrained-Models-For-Myanmar/tree/main" +_LANGUAGES = ["mya"] +_LICENSE = Licenses.UNKNOWN.value +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://github.com/GKLMIP/Pretrained-Models-For-Myanmar/raw/main/Product%20Sentiment%20Dataset.zip", +} + +_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + +_LABELS = [1, 2, 3, 4, 5] + + +class GklmipSentimentDataset(datasets.GeneratorBasedBuilder): + """The GKLMIP Product Sentiment Dataset is a Burmese dataset for sentiment analysis.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + SEACROWD_SCHEMA_NAME = "text" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features({"bpe": datasets.Value("string"), "text": datasets.Value("string"), "label": datasets.Value("string")}) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text_features(_LABELS) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(data_dir, "product_sentiment_dataset_train.json"), + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": 
os.path.join(data_dir, "product_sentiment_dataset_test.json"), + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir, "product_sentiment_dataset_dev.json"), + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + with open(filepath) as file: + dataset = json.load(file) + + if self.config.schema == "source": + for i, line in enumerate(dataset): + yield i, {"bpe": line["bpe"], "text": line["text"], "label": line["label"]} + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + for i, line in enumerate(dataset): + yield i, {"id": i, "text": line["text"], "label": line["label"]} diff --git a/seacrowd/sea_datasets/globalwoz/__init__.py b/seacrowd/sea_datasets/globalwoz/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/globalwoz/globalwoz.py b/seacrowd/sea_datasets/globalwoz/globalwoz.py new file mode 100644 index 000000000..d612eff04 --- /dev/null +++ b/seacrowd/sea_datasets/globalwoz/globalwoz.py @@ -0,0 +1,226 @@ +import os +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import itertools + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses + +_CITATION = """\ +@inproceedings{ding-etal-2022-globalwoz, + title = "{G}lobal{W}o{Z}: Globalizing {M}ulti{W}o{Z} to Develop Multilingual Task-Oriented Dialogue Systems", + author = "Ding, Bosheng and + Hu, Junjie and + Bing, Lidong and + Aljunied, Mahani and + Joty, Shafiq and + Si, Luo and + Miao, Chunyan", + editor = "Muresan, Smaranda and + Nakov, Preslav and + Villavicencio, Aline", + booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", + month = may, + year = "2022", +} +""" + +_DATASETNAME = "globalwoz" + +_DESCRIPTION = """\ +This is the data of the paper “GlobalWoZ: Globalizing MultiWoZ to Develop Multilingual Task-Oriented Dialogue Systems” accepted by ACL 2022. The dataset contains several sub-datasets in 20 languages and 3 schemes (F&E, E&F, F&F), including Indonesian (id), Thai (th), and Vietnamese (vi) language. The method is based on translating dialogue templates and filling them with local entities in the target language countries. 
+""" + + +_HOMEPAGE = "https://github.com/bosheng2020/globalwoz" + + +_LANGUAGES = ["ind", "tha", "vie"] + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = True + +_URLS = {} + +_SUPPORTED_TASKS = [Tasks.E2E_TASK_ORIENTED_DIALOGUE] + +_SOURCE_VERSION = "2.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +def seacrowd_config_constructor(dial_type, lang, schema, version): + if dial_type not in ["EandF", "FandE", "FandF"]: + raise ValueError(f"Invalid dialogue type {dial_type}") + + if lang == "": + raise ValueError(f"Invalid lang {lang}") + + if schema not in ["source", "seacrowd_tod"]: + raise ValueError(f"Invalid schema: {schema}") + + return SEACrowdConfig( + name="globalwoz_{dial_type}_{lang}_{schema}".format(dial_type=dial_type, lang=lang, schema=schema), + version=datasets.Version(version), + description="GlobalWoZ schema for {schema}: {dial_type}_{lang}".format(schema=schema, dial_type=dial_type, lang=lang), + schema=schema, + subset_id="globalwoz_{dial_type}_{lang}".format(dial_type=dial_type, lang=lang), + ) + + +class GlobalWoZ(datasets.GeneratorBasedBuilder): + """This is the data of the paper “GlobalWoZ: Globalizing MultiWoZ to Develop Multilingual Task-Oriented Dialogue Systems” accepted by ACL 2022. + The dataset contains several sub-datasets in 20 languages and 3 schemes (F&E, E&F, F&F), including Indonesian (id), Thai (th), + and Vietnamese (vi) language. The method is based on translating dialogue templates and filling them with local entities in the target language countries. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + seacrowd_config_constructor(tod_format, lang, schema, _SOURCE_VERSION if schema == "source" else _SEACROWD_VERSION) for tod_format, lang, schema in itertools.product(("EandF", "FandE", "FandF"), ("id", "th", "vi"), ("source", "seacrowd_tod")) + ] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "goal": { + "attraction": datasets.Value("string"), + "hospital": datasets.Value("string"), + "hotel": datasets.Value("string"), + "police": datasets.Value("string"), + "restaurant": datasets.Value("string"), + "taxi": datasets.Value("string"), + "train": datasets.Value("string"), + }, + "log": [ + { + "dialog_act": datasets.Value("string"), + "metadata": datasets.Value("string"), + "span_info": [[datasets.Value("string")]], + "text": datasets.Value("string"), + } + ], + } + ) + + elif self.config.schema == "seacrowd_tod": + features = schemas.tod_features + else: + raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + _split_generators = [] + + type_and_lang = {"dial_type": self.config.subset_id.split("_")[1].replace("and", "&"), "lang": self.config.subset_id.split("_")[2]} # globalwoz_{dial_type}_{lang} + + if self.config.data_dir is None: + raise ValueError("This is a local dataset. 
Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + if not os.path.exists(os.path.join(data_dir, f"{type_and_lang['dial_type']}_{type_and_lang['lang']}.json")): + raise FileNotFoundError() + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + # "filepath": data_dir + f"_{type_and_lang['dial_type']}_{type_and_lang['lang']}.json", + "filepath": os.path.join(data_dir, f"{type_and_lang['dial_type']}_{type_and_lang['lang']}.json"), + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + # For local datasets you will have access to self.config.data_dir and self.config.data_files + with open(filepath, "r+", encoding="utf8") as fw: + data = json.load(fw) + + if self.config.schema == "source": + for idx, tod_dialogue in enumerate(data.values()): + example = {} + example["id"] = str(idx) + example["goal"] = {} + + for goal_key in ["attraction", "hospital", "hotel", "police", "restaurant", "taxi", "train"]: + example["goal"][goal_key] = json.dumps(tod_dialogue["goal"][goal_key]) + example["log"] = [] + + for dial_log in tod_dialogue["log"]: + dial = {} + dial["dialog_act"] = json.dumps(dial_log["dialog_act"]) + dial["metadata"] = json.dumps(dial_log["metadata"]) + for i in range(len(dial_log["span_info"])): + for j in range(len(dial_log["span_info"][i])): + dial_log["span_info"][i][j] = str(dial_log["span_info"][i][j]) # casting to str + dial["span_info"] = [[str(span)] if isinstance(span, str) else span for span in dial_log["span_info"]] + dial["text"] = dial_log["text"] + + example["log"].append(dial) + + yield example["id"], example + + elif self.config.schema == "seacrowd_tod": + for idx, tod_dialogue in enumerate(data.values()): + example = {} + example["dialogue_idx"] = idx + + dialogue = [] + # NOTE: the dialogue always started with `user` as first utterance + for turn, i in enumerate(range(0, len(tod_dialogue["log"]) + 2, 2)): + dial = {} + dial["turn_idx"] = turn + + # system_utterance properties + dial["system_utterance"] = "" + dial["system_acts"] = [] + if turn != 0: + dial["system_utterance"] = tod_dialogue["log"][i - 1]["text"] + if i < len(tod_dialogue["log"]): + # NOTE: "system_acts will be populated with the `dialog_act` from the user utterance in the original dataset, as our schema dictates + # that `system_acts` should represent the system's intended actions based on the user's utterance." + for acts in tod_dialogue["log"][i]["dialog_act"].values(): + for act in acts: + dial["system_acts"].append([act[0]]) + + # user_utterance properties + dial["turn_label"] = [] # left as an empty array + dial["belief_state"] = [] + if i == len(tod_dialogue["log"]): + # case if turn_idx > len(dialogue) --> add dummy user_utterance + dial["user_utterance"] = "" + else: + dial["user_utterance"] = tod_dialogue["log"][i]["text"] + # NOTE: "the belief_state will be populated with the `span_info` from the user utterance in the original dataset, as our schema dictates + # that `belief_state` should represent the system's belief state based on the user's utterance." 
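+ # Assumption (inferred from the indexing below; GlobalWoZ keeps MultiWOZ's span_info layout): each span is [<domain>-<act>, slot_name, slot_value, span_start, span_end], and only the act, slot name and slot value are used to build the belief state.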
+ for span in tod_dialogue["log"][i]["span_info"]: + if span[0].split("-")[1] == "request": # Request action + dial["belief_state"].append({"slots": [["slot", span[1]]], "act": "request"}) + else: + dial["belief_state"].append({"slots": [[span[1], span[2]]], "act": span[0].split("-")[1]}) + + # append to dialogue + dialogue.append(dial) + + example["dialogue"] = dialogue + + yield example["dialogue_idx"], example diff --git a/seacrowd/sea_datasets/glotstorybook/glotstorybook.py b/seacrowd/sea_datasets/glotstorybook/glotstorybook.py index e750a89e1..b89ff0d90 100644 --- a/seacrowd/sea_datasets/glotstorybook/glotstorybook.py +++ b/seacrowd/sea_datasets/glotstorybook/glotstorybook.py @@ -39,10 +39,13 @@ 'CC BY-NC-SA', 'CC-BY', 'CC-BY-NC', and 'Public Domain'. We also license the code, actual packaging and the metadata of these data under the cc0-1.0. """ + +_LOCAL=False +_LANGUAGES = ["khg", "khm", "mya", "tet", "tha", "vie"] + _URLS = "https://huggingface.co/datasets/cis-lmu/GlotStoryBook/resolve/main/GlotStoryBook.csv" _SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] -_SUPPORTED_LANGS = ["khg", "khm", "mya", "tet", "tha", "vie"] _SOURCE_VERSION = "1.0.0" _SEACROWD_VERSION = "1.0.0" @@ -117,7 +120,7 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" df = pd.read_csv(filepath) - df = df[df["ISO639-3"].isin(_SUPPORTED_LANGS)] + df = df[df["ISO639-3"].isin(_LANGUAGES)] if self.config.schema == "source": for i, row in df.iterrows(): diff --git a/seacrowd/sea_datasets/gnome/__init__.py b/seacrowd/sea_datasets/gnome/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/gnome/gnome.py b/seacrowd/sea_datasets/gnome/gnome.py new file mode 100644 index 000000000..3c79e57a2 --- /dev/null +++ b/seacrowd/sea_datasets/gnome/gnome.py @@ -0,0 +1,190 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
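+# Implementation note: the corpus archive URL is not hard-coded; _split_generators queries the OPUS API (_URLS["api"]) at runtime and takes the first corpus entry whose URL contains ".txt.zip", so building any language-pair config requires network access to opus.nlpl.eu.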
+ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import requests + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import SCHEMA_TO_FEATURES, TASK_TO_SCHEMA, Licenses, Tasks + +_CITATION = r"""\ +@inproceedings{tiedemann-2012-parallel, + title = "Parallel Data, Tools and Interfaces in {OPUS}", + author = {Tiedemann, J{\"o}rg}, + editor = "Calzolari, Nicoletta and + Choukri, Khalid and + Declerck, Thierry and + Do{\u{g}}an, Mehmet U{\u{g}}ur and + Maegaard, Bente and + Mariani, Joseph and + Moreno, Asuncion and + Odijk, Jan and + Piperidis, Stelios", + booktitle = "Proceedings of the Eighth International Conference on Language + Resources and Evaluation ({LREC}'12)", + month = may, + year = "2012", + address = "Istanbul, Turkey", + publisher = "European Language Resources Association (ELRA)", + url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/463_Paper.pdf", + pages = "2214--2218", + abstract = "This paper presents the current status of OPUS, a growing + language resource of parallel corpora and related tools. The focus in OPUS + is to provide freely available data sets in various formats together with + basic annotation to be useful for applications in computational linguistics, + translation studies and cross-linguistic corpus studies. In this paper, we + report about new data sets and their features, additional annotation tools + and models provided from the website and essential interfaces and on-line + services included in the project.", +} +""" + +_DATASETNAME = "gnome" + +_DESCRIPTION = """\ +A parallel corpus of GNOME localization files, which contains the interface text +in the GNU Network Object Model Environment (GNOME) and published by GNOME +translation teams. Text in this dataset is relatively short and technical. 
+""" + +_HOMEPAGE = "https://opus.nlpl.eu/GNOME/corpus/version/GNOME" + +_LANGUAGES = ["eng", "vie", "mya", "ind", "tha", "tgl", "zlm", "lao"] +_SUBSETS = ["en", "vi", "my", "id", "th", "tl", "ms", "lo"] +_SUBSET_PAIRS = [(src, tgt) for src in _SUBSETS for tgt in _SUBSETS if src != tgt] + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + "api": "http://opus.nlpl.eu/opusapi/?source={src_lang}&target={tgt_lang}&corpus=GNOME&version=v1", + "data": "https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/{lang_pair}.txt.zip", +} + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] +_SEACROWD_SCHEMA = f"seacrowd_{TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower()}" # t2t + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class GnomeDataset(datasets.GeneratorBasedBuilder): + """A parallel corpus of GNOME localization files""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [] + for subset in _SUBSET_PAIRS: + lang_pair = f"{subset[0]}-{subset[1]}" + BUILDER_CONFIGS += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang_pair}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} {lang_pair} source schema", + schema="source", + subset_id=lang_pair, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang_pair}_{_SEACROWD_SCHEMA}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} {lang_pair} SEACrowd schema", + schema=_SEACROWD_SCHEMA, + subset_id=lang_pair, + ), + ] + + DEFAULT_CONFIG_NAME = ( + f"{_DATASETNAME}_{_SUBSET_PAIRS[0][0]}-{_SUBSET_PAIRS[0][1]}_source" + ) + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "source": datasets.Value("string"), + "target": datasets.Value("string"), + } + ) + elif self.config.schema == _SEACROWD_SCHEMA: + features = SCHEMA_TO_FEATURES[ + TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]] + ] # text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + src_lang, tgt_lang = self.config.subset_id.split("-") + api_url = _URLS["api"].format(src_lang=src_lang, tgt_lang=tgt_lang) + data_url = None + + response = requests.get(api_url, timeout=10) + if response: + corpora = response.json()["corpora"] + for corpus in corpora: + if ".txt.zip" in corpus["url"]: + data_url = corpus["url"] + break + else: + raise requests.exceptions.HTTPError( + f"Non-success status code: {response.status_code}" + ) + + if not data_url: + raise ValueError(f"No suitable corpus found, check {api_url}") + else: + lang_pair = data_url.split("/")[-1].split(".")[0] + data_dir = Path(dl_manager.download_and_extract(data_url)) + src_file = data_dir / f"GNOME.{lang_pair}.{src_lang}" + tgt_file = data_dir / f"GNOME.{lang_pair}.{tgt_lang}" + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "src_file": src_file, + "tgt_file": tgt_file, + }, + ), + ] + + def _generate_examples(self, src_file: Path, tgt_file: Path) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + with open(src_file, "r", encoding="utf-8") as src_f, open( + tgt_file, "r", encoding="utf-8" + ) as tgt_f: + for idx, (src_line, tgt_line) in enumerate(zip(src_f, tgt_f)): + if self.config.schema == "source": + yield idx, {"source": src_line.strip(), "target": 
tgt_line.strip()} + elif self.config.schema == _SEACROWD_SCHEMA: + yield idx, { + "id": str(idx), + "text_1": src_line.strip(), + "text_2": tgt_line.strip(), + "text_1_name": f"source ({src_file.name.split('.')[-1]})", + "text_2_name": f"target ({tgt_file.name.split('.')[-1]})", + } diff --git a/seacrowd/sea_datasets/iapp_squad/__init__.py b/seacrowd/sea_datasets/iapp_squad/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/iapp_squad/iapp_squad.py b/seacrowd/sea_datasets/iapp_squad/iapp_squad.py new file mode 100644 index 000000000..1de682e33 --- /dev/null +++ b/seacrowd/sea_datasets/iapp_squad/iapp_squad.py @@ -0,0 +1,128 @@ +# coding=utf-8 +import json + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_DATASETNAME = "iapp_squad" +_CITATION = """\ +@dataset +{ + kobkrit_viriyayudhakorn_2021_4539916, + author = {Kobkrit Viriyayudhakorn and Charin Polpanumas}, + title = {iapp_wiki_qa_squad}, + month = feb, + year = 2021, + publisher = {Zenodo}, + version = 1, + doi = {10.5281/zenodo.4539916}, + url = {https://doi.org/10.5281/zenodo.4539916} +} +""" + +_DESCRIPTION = """ +`iapp_wiki_qa_squad` is an extractive question answering dataset from Thai Wikipedia articles. +It is adapted from [the original iapp-wiki-qa-dataset](https://github.com/iapp-technology/iapp-wiki-qa-dataset) +to [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) format, resulting in +5761/742/739 questions from 1529/191/192 articles. +""" + +_HOMEPAGE = "https://github.com/iapp-technology/iapp-wiki-qa-dataset" +_LICENSE = Licenses.MIT.value +_HF_URL = " https://huggingface.co/datasets/iapp_wiki_qa_squad" +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] +_LOCAL = False +_LANGUAGES = ["tha"] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + +_URLS = { + "train": "https://raw.githubusercontent.com/iapp-technology/iapp-wiki-qa-dataset/main/squad_format/data/train.jsonl", + "validation": "https://raw.githubusercontent.com/iapp-technology/iapp-wiki-qa-dataset/main/squad_format/data/valid.jsonl", + "test": "https://raw.githubusercontent.com/iapp-technology/iapp-wiki-qa-dataset/main/squad_format/data/test.jsonl", +} + + +class IappWikiQASquadDataset(datasets.GeneratorBasedBuilder): + BUILDER_CONFIGS = [ + SEACrowdConfig(name=f"{_DATASETNAME}_source", version=datasets.Version(_SOURCE_VERSION), description=_DESCRIPTION, subset_id=f"{_DATASETNAME}", schema="source"), + SEACrowdConfig(name=f"{_DATASETNAME}_seacrowd_qa", version=datasets.Version(_SEACROWD_VERSION), description=_DESCRIPTION, subset_id=f"{_DATASETNAME}", schema="seacrowd_qa"), + ] + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self): + if self.config.schema == "source": + features = datasets.Features( + { + "question_id": datasets.Value("string"), + "article_id": datasets.Value("string"), + "title": datasets.Value("string"), + "context": datasets.Value("string"), + "question": datasets.Value("string"), + "answers": datasets.features.Sequence( + { + "text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + "answer_end": datasets.Value("int32"), + } + ), + } + ) + elif self.config.schema == "seacrowd_qa": + features = schemas.qa_features + features["meta"] = { + "answer_start": datasets.Value("int32"), + "answer_end": datasets.Value("int32"), + } + return datasets.DatasetInfo(description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, citation=_CITATION, 
license=_LICENSE) + + def _split_generators(self, dl_manager): + file_paths = dl_manager.download_and_extract(_URLS) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": file_paths["train"]}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": file_paths["validation"]}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": file_paths["test"]}, + ), + ] + + def _generate_examples(self, filepath): + """Yields examples.""" + with open(filepath, encoding="utf-8") as f: + for id_, row in enumerate(f): + data = json.loads(row) + if self.config.schema == "source": + yield id_, { + "question_id": data["question_id"], + "article_id": data["article_id"], + "title": data["title"], + "context": data["context"], + "question": data["question"], + "answers": { + "text": data["answers"]["text"], + "answer_start": data["answers"]["answer_start"], + "answer_end": data["answers"]["answer_end"], + }, + } + elif self.config.schema == "seacrowd_qa": + yield id_, { + "id": id_, + "question_id": data["question_id"], + "document_id": data["article_id"], + "question": data["question"], + "type": "abstractive", + "choices": [], + "context": data["context"], + "answer": data["answers"]["text"], + "meta": {"answer_start": data["answers"]["answer_start"][0], "answer_end": data["answers"]["answer_end"][0]}, + } diff --git a/seacrowd/sea_datasets/iatf/__init__.py b/seacrowd/sea_datasets/iatf/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/iatf/iatf.py b/seacrowd/sea_datasets/iatf/iatf.py new file mode 100644 index 000000000..bed387c36 --- /dev/null +++ b/seacrowd/sea_datasets/iatf/iatf.py @@ -0,0 +1,183 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pyreadr + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import (TASK_TO_SCHEMA, Licenses, Tasks) + +_DATASETNAME = "iatf" + +_CITATION = """\ +@misc{ + iatf, + title={Inter-Agency Task Force for the Management of Emerging Infectious Diseases (IATF) COVID-19 Resolutions}, + url={https://como-ph.github.io/post/creating-text-data-from-iatf-resolutions/}, + author={Chris Mercado, John Robert Medina, Ernest Guevarra} +} +""" + +_DESCRIPTION = """\ +To assess possible impact of various COVID-19 prediction models on Philippine government response, text from various resolutions issued by +the Inter-agency Task Force for the Management of Emerging Infectious Diseases (IATF) has been collected using data mining approaches implemented in R. 
+""" + +_HOMEPAGE = "https://github.com/como-ph/covidphtext/tree/master/data" + +_LICENSE = Licenses.GPL_3_0.value + +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] +_SEACROWD_SCHEMA_NAME = TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower() +_LANGUAGES = ["fil"] +_LOCAL = False +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + +_URL_BASE = "https://github.com/como-ph/covidphtext/raw/master/data/" +_URLS = [ + "iatfGuidelineOmnibus.rda", + "iatfResolution01.rda", + "iatfResolution02.rda", + "iatfResolution03.rda", + "iatfResolution04.rda", + "iatfResolution05.rda", + "iatfResolution06.rda", + "iatfResolution07.rda", + "iatfResolution08.rda", + "iatfResolution09.rda", + "iatfResolution10.rda", + "iatfResolution11.rda", + "iatfResolution12.rda", + "iatfResolution13.rda", + "iatfResolution14.rda", + "iatfResolution15.rda", + "iatfResolution16.rda", + "iatfResolution17.rda", + "iatfResolution18.rda", + "iatfResolution19.rda", + "iatfResolution20.rda", + "iatfResolution21.rda", + "iatfResolution22.rda", + "iatfResolution23.rda", + "iatfResolution24.rda", + "iatfResolution25.rda", + "iatfResolution26.rda", + "iatfResolution27.rda", + "iatfResolution28.rda", + "iatfResolution29.rda", + "iatfResolution30.rda", + "iatfResolution30A.rda", + "iatfResolution31.rda", + "iatfResolution32.rda", + "iatfResolution33.rda", + "iatfResolution34.rda", + "iatfResolution35.rda", + "iatfResolution36.rda", + "iatfResolution37.rda", + "iatfResolution38.rda", + "iatfResolution39.rda", + "iatfResolution40.rda", + "iatfResolution41.rda", + "iatfResolution42.rda", + "iatfResolution43.rda", + "iatfResolution44.rda", + "iatfResolution45.rda", + "iatfResolution46.rda", + "iatfResolution46A.rda", + "iatfResolution47.rda", + "iatfResolution48.rda", + "iatfResolution49.rda", + "iatfResolution50.rda", + "iatfResolution50A.rda", + "iatfResolution51.rda", + "iatfResolution52.rda", + "iatfResolution53.rda", + "iatfResolution54.rda", + "iatfResolution55.rda", + "iatfResolution55A.rda", + "iatfResolution56.rda", + "iatfResolution57.rda", + "iatfResolution58.rda", + "iatfResolution59.rda", + "iatfResolution60.rda", + "iatfResolution60A.rda", +] + + +class IATFDataset(datasets.GeneratorBasedBuilder): + """Inter-agency Task Force for the Management of Emerging Infectious Diseases Dataset""" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{_SEACROWD_SCHEMA_NAME}", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} seacrowd schema", + schema=f"seacrowd_{_SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + elif self.config.schema == f"seacrowd_{_SEACROWD_SCHEMA_NAME}": + features = schemas.self_supervised_pretraining.features + else: + raise ValueError(f"Invalid config schema: {self.config.schema}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + filepaths = [Path(dl_manager.download(_URL_BASE + url)) for url in _URLS] + 
return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepaths": filepaths}, + ), + ] + + def _generate_examples(self, filepaths: List[Path]) -> Tuple[int, Dict]: + counter = 0 + for path in filepaths: + data = pyreadr.read_r(path) + text = " ".join([str(x) for x in data[list(data.keys())[0]]["text"].values]) + if self.config.schema == "source": + yield ( + counter, + { + "id": str(counter), + "text": text.strip(), + }, + ) + elif self.config.schema == f"seacrowd_{_SEACROWD_SCHEMA_NAME}": + yield ( + counter, + { + "id": str(counter), + "text": text.strip(), + }, + ) + + counter += 1 diff --git a/seacrowd/sea_datasets/icon/__init__.py b/seacrowd/sea_datasets/icon/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/icon/icon.py b/seacrowd/sea_datasets/icon/icon.py new file mode 100644 index 000000000..ef7149573 --- /dev/null +++ b/seacrowd/sea_datasets/icon/icon.py @@ -0,0 +1,216 @@ +import dataclasses +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import datasets +import nltk +from nltk import Tree +from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import (DEFAULT_SEACROWD_VIEW_NAME, + DEFAULT_SOURCE_VIEW_NAME, Licenses, + Tasks) + +_DATASETNAME = "icon" +_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME +_UNIFIED_VIEW_NAME = DEFAULT_SEACROWD_VIEW_NAME +_CITATION = """\ +@inproceedings{lim2023icon, + title={ICON: Building a Large-Scale Benchmark Constituency Treebank for the Indonesian Language}, + author={Lim, Ee Suan and Leong, Wei Qi and Nguyen, Ngan Thanh and Adhista, Dea and Kng, Wei Ming and Tjh, William Chandra and Purwarianti, Ayu}, + booktitle={Proceedings of the 21st International Workshop on Treebanks and Linguistic Theories (TLT, GURT/SyntaxFest 2023)}, + pages={37--53}, + year={2023} +} +""" + +_DESCRIPTION = """\ +ICON (Indonesian CONstituency treebank) is a large-scale high-quality constituency treebank (10000 sentences) +for the Indonesian language, sourced from Wikipedia and news data from Tempo, spanning the period from 1971 to 2016. +The annotation guidelines were formulated with the Penn Treebank POS tagging and bracketing guidelines as a reference, +with additional adaptations to account for the characteristics of the Indonesian language. 
+""" + +_HOMEPAGE = "https://github.com/aisingapore/seacorenlp-data/tree/main/id/constituency" + +_LICENSE = Licenses.CC_BY_NC_SA_4_0.value +_LANGUAGES = ["ind"] +_LOCAL = False +_URLS = { + "train": "https://raw.githubusercontent.com/aisingapore/seacorenlp-data/main/id/constituency/train.txt", + "validation": "https://raw.githubusercontent.com/aisingapore/seacorenlp-data/main/id/constituency/dev.txt", + "test": "https://raw.githubusercontent.com/aisingapore/seacorenlp-data/main/id/constituency/test.txt", +} + +_SUPPORTED_TASKS = [Tasks.CONSTITUENCY_PARSING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class ICONDataset(datasets.GeneratorBasedBuilder): + + BUILDER_CONFIGS = [ + SEACrowdConfig(name=f"{_DATASETNAME}_source", version=datasets.Version(_SOURCE_VERSION), description=_DESCRIPTION, schema="source", subset_id=f"{_DATASETNAME}"), + SEACrowdConfig(name=f"{_DATASETNAME}_seacrowd_tree", version=datasets.Version(_SEACROWD_VERSION), description=_DESCRIPTION, schema="seacrowd_tree", subset_id=f"{_DATASETNAME}"), + ] + + DEFAULT_CONFIG_NAME = "icon_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "index": datasets.Value("string"), # index + "tree": datasets.Value("string"), # nltk.tree + "sentence": datasets.Value("string"), # bracketed sentence tree + "words": datasets.Sequence(datasets.Value("string")), # words + "POS": datasets.Sequence(datasets.Value("string")), # pos-tags + } + ) + elif self.config.schema == "seacrowd_tree": + features = schemas.tree_features + + else: + raise ValueError(f"Invalid config: {self.config.name}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + + train_txt = Path(dl_manager.download_and_extract(_URLS["train"])) + dev_txt = Path(dl_manager.download_and_extract(_URLS["validation"])) + test_txt = Path(dl_manager.download_and_extract(_URLS["test"])) + + data_dir = { + "train": train_txt, + "validation": dev_txt, + "test": test_txt, + } + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir["train"], + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_dir["test"], + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_dir["validation"], + "split": "dev", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + trees = nltk_load_trees(filepath) + if self.config.schema == "source": + for idx, tree in enumerate(trees): + ex = {"index": str(idx), "tree": tree.tree, "words": tree.words, "sentence": tree.bra_sent, "POS": [itm[1] for itm in tree.pos()]} + yield idx, ex + if self.config.schema == "seacrowd_tree": + for idx, tree in enumerate(trees): + ex = get_node_char_indices_with_ids(tree.tree, str(idx)) + yield idx, ex + + +class BaseInputExample(ABC): + """Parser input for a single sentence (abstract interface).""" + + words: List[str] + space_after: List[bool] + tree: Optional[nltk.Tree] + + @abstractmethod + def leaves(self) -> Optional[List[str]]: + """Returns leaves to use in the parse tree.""" + pass + + @abstractmethod + def pos(self) -> Optional[List[Tuple[str, str]]]: + """Returns a 
list of (leaf, part-of-speech tag) tuples.""" + pass + + +@dataclasses.dataclass +class ParsingExample(BaseInputExample): + """A single parse tree and sentence.""" + + words: List[str] + bra_sent: str + tree: Optional[nltk.Tree] = None + _pos: Optional[List[Tuple[str, str]]] = None + + def leaves(self) -> Optional[List[str]]: + return self.tree.leaves() if self.tree else None + + def pos(self) -> Optional[List[Tuple[str, str]]]: + return self.tree.pos() if self.tree else self._pos + + def without_gold_annotations(self) -> "ParsingExample": + return dataclasses.replace(self, tree=None, _pos=self.pos()) + + +def nltk_load_trees(const_path: str) -> List[ParsingExample]: + reader = BracketParseCorpusReader("", [const_path]) + trees = reader.parsed_sents() + with open(const_path, "r") as filein: + bracketed_sentences = [itm.strip() for itm in filein.readlines()] + sents = [tree.leaves() for tree in trees] + assert len(trees) == len(sents) == len(bracketed_sentences), f"Number Mismatched: {len(trees)} vs {len(bracketed_sentences)}" + treebank = [ParsingExample(tree=tree, words=words, bra_sent=bra_sent) for tree, bra_sent, words, in zip(trees, bracketed_sentences, sents)] + for example in treebank: + assert len(example.words) == len(example.leaves()), "Token count mismatch." + return treebank + + +def get_node_char_indices_with_ids(tree, sent_id): + def traverse_tree(subtree, start_index): + nonlocal node_id + current_id = node_id + node_id += 1 + node_text = " ".join(subtree.leaves()) + end_index = start_index + len(node_text) + + # Record the current node + node_data = { + "id": f"{sent_id}_{current_id}", + "type": subtree.label(), + "text": node_text, + "offsets": [start_index, end_index], + "subnodes": [], + } + node_indices.append(node_data) + + for child in subtree: + if isinstance(child, Tree): + child_id = traverse_tree(child, start_index) + node_data["subnodes"].append(child_id) + start_index += len(" ".join(child.leaves())) + 1 + return f"{sent_id}_{current_id}" + + node_indices = [] + node_id = 0 + traverse_tree(tree, 0) + sentence = " ".join(tree.leaves()) + passage = {"id": "p" + sent_id, "type": None, "text": tree.leaves(), "offsets": [0, len(sentence)]} + return {"id": "s" + sent_id, "passage": passage, "nodes": node_indices} diff --git a/seacrowd/sea_datasets/id_coreference_resolution/__init__.py b/seacrowd/sea_datasets/id_coreference_resolution/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/id_msvd/__init__.py b/seacrowd/sea_datasets/id_msvd/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/id_msvd/id_msvd.py b/seacrowd/sea_datasets/id_msvd/id_msvd.py new file mode 100644 index 000000000..6e0bd928a --- /dev/null +++ b/seacrowd/sea_datasets/id_msvd/id_msvd.py @@ -0,0 +1,134 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import (SCHEMA_TO_FEATURES, TASK_TO_SCHEMA, + Licenses, Tasks) + +_CITATION = """\ +@article{hendria2023msvd, + author = {Willy Fitra Hendria}, + title = {MSVD-Indonesian: A Benchmark for Multimodal Video-Text Tasks in Indonesian}, + journal = {arXiv preprint arXiv:2306.11341}, + year = {2023}, + url = {https://arxiv.org/abs/2306.11341}, +} +""" + +_DATASETNAME = "id_msvd" +_DESCRIPTION = """\ +MSVD-Indonesian is derived from the MSVD (Microsoft Video Description) dataset, which is +obtained with the help of a machine translation 
service (Google Translate API). This +dataset can be used for multimodal video-text tasks, including text-to-video retrieval, +video-to-text retrieval, and video captioning. Same as the original English dataset, the +MSVD-Indonesian dataset contains about 80k video-text pairs. +""" + +_HOMEPAGE = "https://github.com/willyfh/msvd-indonesian" +_LANGUAGES = ["ind"] +_LICENSE = Licenses.MIT.value +_URLS = {"text": "https://raw.githubusercontent.com/willyfh/msvd-indonesian/main/data/MSVD-indonesian.txt", "video": "https://www.cs.utexas.edu/users/ml/clamp/videoDescription/YouTubeClips.tar"} + +_SUPPORTED_TASKS = [Tasks.VIDEO_TO_TEXT_RETRIEVAL] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class IdMsvdDataset(datasets.GeneratorBasedBuilder): + """MSVD dataset with Indonesian translation.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower() # "vidtext" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "video_path": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = SCHEMA_TO_FEATURES[self.SEACROWD_SCHEMA_NAME.upper()] # video_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + # expect several minutes to download video data ~1.7GB + data_path = dl_manager.download_and_extract(_URLS) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "text_path": Path(data_path["text"]), + "video_path": Path(data_path["video"]) / "YouTubeClips", + "split": "train", + }, + ), + ] + + def _generate_examples(self, text_path: Path, video_path: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + text_data = [] + with open(text_path, "r", encoding="utf-8") as f: + for line in f: + id = line.find(" ") + video = line[:id] + text = line[id + 1 :].strip() + text_data.append([video, text]) + + df = pd.DataFrame(text_data, columns=["video_path", "text"]) + df["video_path"] = df["video_path"].apply(lambda x: video_path / f"{x}.avi") + + if self.config.schema == "source": + for i, row in df.iterrows(): + yield i, { + "video_path": str(row["video_path"]), + "text": row["text"], + } + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + for i, row in df.iterrows(): + yield i, { + "id": str(i), + "video_path": str(row["video_path"]), + "text": row["text"], + "metadata": { + "resolution": { + "width": None, + "height": None, + }, + "duration": None, + "fps": None, + }, + } diff --git a/seacrowd/sea_datasets/id_newspaper_2018/__init__.py b/seacrowd/sea_datasets/id_newspaper_2018/__init__.py new file mode 100644 index 000000000..e69de29bb 
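The loaders in this patch are standard Hugging Face datasets builder scripts, so each can be smoke-tested straight from a local checkout. A minimal sketch, assuming the seacrowd package is importable and using the id_newspaper_2018 loader added directly below (config names follow the "{_DATASETNAME}_source" pattern used throughout; the source archive holds roughly 500K articles, so the first call downloads a large file):

import datasets

# Build the source schema of the ID Newspapers 2018 loader from the local script.
ds = datasets.load_dataset(
    "seacrowd/sea_datasets/id_newspaper_2018/id_newspaper_2018.py",
    name="id_newspaper_2018_source",
    split="train",
)
print(ds[0]["title"], ds[0]["url"])

The seacrowd_ssp config of the same loader exposes each article as a plain {"id", "text"} record for pretraining-style consumption.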
diff --git a/seacrowd/sea_datasets/id_newspaper_2018/id_newspaper_2018.py b/seacrowd/sea_datasets/id_newspaper_2018/id_newspaper_2018.py new file mode 100644 index 000000000..3e0ebf190 --- /dev/null +++ b/seacrowd/sea_datasets/id_newspaper_2018/id_newspaper_2018.py @@ -0,0 +1,150 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@misc{feryandi2018, + author={Nurdiantoro, Feryandi}, + title={Dataset-Artikel}, + year = {2018}, + url = {https://github.com/feryandi/Dataset-Artikel}, +} +""" + +_DATASETNAME = "id_newspaper_2018" + +_DESCRIPTION = """\ +The ID Newspapers 2018 dataset provides 500K articles from various Indonesian news sources. Articles were taken from +7 primary sources (Detik, Kompas, Tempo, CNN Indonesia, Sindo, Republika, Poskota). The compressed files can be +retrieved from https://huggingface.co/datasets/indonesian-nlp/id_newspapers_2018. +""" + +_HOMEPAGE = "https://github.com/feryandi/Dataset-Artikel" + +_LANGUAGES = ["ind"] + +_LICENSE = Licenses.CC_BY_SA_4_0.value + +_LOCAL = False + +_URLS = "https://huggingface.co/datasets/indonesian-nlp/id_newspapers_2018/resolve/main/newspapers-json.tgz" + +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class IDNewspapers2018Dataset(datasets.GeneratorBasedBuilder): + """ + ID Newspapers 2018 is a pretraining dataset from https://huggingface.co/datasets/indonesian-nlp/id_newspapers_2018.
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_ssp", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_ssp", + subset_id=f"{_DATASETNAME}", + ), + ] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features({"url": datasets.Value("string"), "date": datasets.Value("string"), "title": datasets.Value("string"), "content": datasets.Value("string")}) + elif self.config.schema == "seacrowd_ssp": + features = schemas.ssp_features + else: + raise ValueError(f"Invalid schema: '{self.config.schema}'") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """ + Returns SplitGenerators. + """ + + path = dl_manager.download_and_extract(_URLS) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "path": path, + "split": "train", + }, + ) + ] + + def _generate_examples(self, path: Path, split: str) -> Tuple[int, Dict]: + """ + Yields examples as (key, example) tuples. + """ + file_paths = [] + for path, subdirs, files in os.walk(path): + for name in files: + if name[-5:] == ".json": + file_paths.append(os.path.join(path, name)) + + for idx, file_path in enumerate(file_paths): + with open(file_path, "r", encoding="utf-8") as file: + data = json.load(file) + + if self.config.schema == "source": + x = { + "url": data["url"], + "date": data["date"], + "title": data["title"], + "content": data["content"], + } + yield idx, x + + elif self.config.schema == "seacrowd_ssp": + x = { + "id": str(idx), + "text": data["content"], + } + yield idx, x + + else: + raise ValueError(f"Invalid schema: '{self.config.schema}'") diff --git a/seacrowd/sea_datasets/id_sent_emo_mobile_apps/__init__.py b/seacrowd/sea_datasets/id_sent_emo_mobile_apps/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/id_sent_emo_mobile_apps/id_sent_emo_mobile_apps.py b/seacrowd/sea_datasets/id_sent_emo_mobile_apps/id_sent_emo_mobile_apps.py new file mode 100644 index 000000000..543aea4f8 --- /dev/null +++ b/seacrowd/sea_datasets/id_sent_emo_mobile_apps/id_sent_emo_mobile_apps.py @@ -0,0 +1,136 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@article{riccosan2023, + author = {Riccosan and Saputra, Karen Etania}, + title = {Multilabel multiclass sentiment and emotion dataset from indonesian mobile application review}, + journal = {Data in Brief}, + volume = {50}, + year = {2023}, + doi = {10.1016/j.dib.2023.109576}, +} +""" + +_LOCAL = False +_LANGUAGES = ["ind"] +_DATASETNAME = "id_sent_emo_mobile_apps" +_DESCRIPTION = """ +This dataset contains manually annotated public reviews of mobile applications in Indonesia. +Each review is given a sentiment label (positive, negative, neutral) and +an emotion label (anger, sadness, fear, happiness, love, neutral). +""" +_HOMEPAGE = "https://github.com/Ricco48/Multilabel-Sentiment-and-Emotion-Dataset-from-Indonesian-" "Mobile-Application-Review/tree/CreateCodeForPaper" +_LICENSE = Licenses.CC_BY_NC_ND_4_0.value +_URL = ( + "https://github.com/Ricco48/Multilabel-Sentiment-and-Emotion-Dataset-from-Indonesian-Mobile-Application-Review/raw/CreateCodeForPaper/" + "Multilabel%20Sentiment%20and%20Emotion%20Dataset%20from%20Indonesian%20Mobile%20Application%20Review/Multilabel%20Sentiment%20and%20Emotion" + "%20Dataset%20from%20Indonesian%20Mobile%20Application%20Review.csv" +) + +_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS, Tasks.EMOTION_CLASSIFICATION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class EmoSentIndMobile(datasets.GeneratorBasedBuilder): + """Dataset of Indonesian mobile application reviews manually annotated for emotion and sentiment.""" + + SUBSETS = ["emotion", "sentiment"] + EMOTION_CLASS_LABELS = ["Sad", "Anger", "Fear", "Happy", "Love", "Neutral"] + SENTIMENT_CLASS_LABELS = ["Negative", "Positive", "Neutral"] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME + ) + ] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_seacrowd_text", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema for {subset} subset", + schema="seacrowd_text", + subset_id=f"{_DATASETNAME}_{subset}", + ) + for subset in SUBSETS + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "content": datasets.Value("string"), + "sentiment": datasets.Value("string"), + "emotion": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_text": + if "emotion" in self.config.subset_id: + labels = self.EMOTION_CLASS_LABELS + elif "sentiment" in self.config.subset_id: + labels = self.SENTIMENT_CLASS_LABELS + features = schemas.text_features(label_names=labels) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + fp = dl_manager.download(_URL) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": fp}, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + df 
= pd.read_csv(filepath, sep="\t", index_col=None) + for index, row in df.iterrows(): + if self.config.schema == "source": + example = { + "content": row["content"], + "sentiment": row["Sentiment"].title(), + "emotion": row["Emotion"].title(), + } + elif self.config.schema == "seacrowd_text": + if "emotion" in self.config.subset_id: + label = row["Emotion"] + elif "sentiment" in self.config.subset_id: + label = row["Sentiment"] + example = {"id": str(index), "text": row["content"], "label": label.title()} + yield index, example diff --git a/seacrowd/sea_datasets/id_sentiment_analysis/__init__.py b/seacrowd/sea_datasets/id_sentiment_analysis/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/id_sentiment_analysis/id_sentiment_analysis.py b/seacrowd/sea_datasets/id_sentiment_analysis/id_sentiment_analysis.py new file mode 100644 index 000000000..626eb9646 --- /dev/null +++ b/seacrowd/sea_datasets/id_sentiment_analysis/id_sentiment_analysis.py @@ -0,0 +1,162 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +_CITATION = """\ +@misc{ridife2019idsa, + author = {Fe, Ridi}, + title = {Indonesia Sentiment Analysis Dataset}, + year = {2019}, + publisher = {GitHub}, + journal = {GitHub repository}, + howpublished = {\\url{https://github.com/ridife/dataset-idsa}} +} +""" + +_DATASETNAME = "id_sentiment_analysis" + +_DESCRIPTION = """\ +This dataset consists of 10806 labeled Indonesian tweets with their corresponding sentiment analysis: positive, negative, and neutral, up to 2019. +This dataset was developed in Cloud Experience Research Group, Gadjah Mada University. +There is no further explanation of the dataset. Contributor found this dataset after skimming through "Sentiment analysis of Indonesian datasets based on a hybrid deep-learning strategy" (Lin CH and Nuha U, 2023). 
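A small, hedged sketch of loading the id_sentiment_analysis dataset through its loader defined further down. The seacrowd config name assumes the sentiment-analysis task resolves to the text schema, as in the BUILDER_CONFIGS below; split and invocation details may differ in the final SEACrowd tooling.

from datasets import load_dataset

script = "seacrowd/sea_datasets/id_sentiment_analysis/id_sentiment_analysis.py"

# Raw columns: "sentimen" (integer code 1 / -1 / 0) and "tweet".
source = load_dataset(script, name="id_sentiment_analysis_source", split="train")

# SEACrowd text schema; labels keep the source's integer codes.
text = load_dataset(script, name="id_sentiment_analysis_seacrowd_text", split="train")

print(source[0]["sentimen"], source[0]["tweet"][:80])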
+""" + +_HOMEPAGE = "https://ridi.staff.ugm.ac.id/2019/03/06/indonesia-sentiment-analysis-dataset/" + +_LANGUAGES = ["ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://raw.githubusercontent.com/ridife/dataset-idsa/master/Indonesian%20Sentiment%20Twitter%20Dataset%20Labeled.csv", +} + +_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS] +_SUPPORTED_SCHEMA_STRINGS = [f"seacrowd_{str(TASK_TO_SCHEMA[task]).lower()}" for task in _SUPPORTED_TASKS] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class IdSentimentAnalysis(datasets.GeneratorBasedBuilder): + """This dataset consists of 10806 labeled Indonesian tweets with their corresponding sentiment analysis: positive, negative, and neutral, up to 2019.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + ] + + seacrowd_schema_config: List[SEACrowdConfig] = [] + + for seacrowd_schema in _SUPPORTED_SCHEMA_STRINGS: + + seacrowd_schema_config.append( + SEACrowdConfig( + name=f"{_DATASETNAME}_{seacrowd_schema}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} {seacrowd_schema} schema", + schema=f"{seacrowd_schema}", + subset_id=f"{_DATASETNAME}", + ) + ) + + BUILDER_CONFIGS.extend(seacrowd_schema_config) + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "sentimen": datasets.Value("int32"), + "tweet": datasets.Value("string"), + } + ) + + elif self.config.schema == f"seacrowd_{str(TASK_TO_SCHEMA[Tasks.SENTIMENT_ANALYSIS]).lower()}": + features = schemas.text_features(label_names=[1, -1, 0]) + + else: + raise ValueError(f"Invalid config: {self.config.name}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + path = dl_manager.download_and_extract(_URLS[_DATASETNAME]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "path": path, + }, + ), + ] + + def _generate_examples(self, path: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + idx = 0 + + if self.config.schema == "source": + df = pd.read_csv(path, delimiter="\t") + + df.rename(columns={"Tweet": "tweet"}, inplace=True) + + for _, row in df.iterrows(): + yield idx, row.to_dict() + idx += 1 + + elif self.config.schema == f"seacrowd_{str(TASK_TO_SCHEMA[Tasks.SENTIMENT_ANALYSIS]).lower()}": + df = pd.read_csv(path, delimiter="\t") + + df["id"] = df.index + df.rename(columns={"sentimen": "label"}, inplace=True) + df.rename(columns={"Tweet": "text"}, inplace=True) + + for _, row in df.iterrows(): + yield idx, row.to_dict() + idx += 1 + + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/id_vaccines_tweets/__init__.py b/seacrowd/sea_datasets/id_vaccines_tweets/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/id_vaccines_tweets/id_vaccines_tweets.py 
b/seacrowd/sea_datasets/id_vaccines_tweets/id_vaccines_tweets.py new file mode 100644 index 000000000..6cb0859b5 --- /dev/null +++ b/seacrowd/sea_datasets/id_vaccines_tweets/id_vaccines_tweets.py @@ -0,0 +1,113 @@ +import datasets +import pandas + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import (DEFAULT_SEACROWD_VIEW_NAME, + DEFAULT_SOURCE_VIEW_NAME, Tasks, Licenses) + +_DATASETNAME = "id_vaccines_tweets" +_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME +_UNIFIED_VIEW_NAME = DEFAULT_SEACROWD_VIEW_NAME + +_LANGUAGES = ["ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) +_LOCAL = False +_CITATION = """\ +@article{febriyanti2021analisis, + title={ANALISIS SENTIMEN MASYARAKAT INDONESIA TERHADAP PELAKSANAAN VAKSIN COVID'19}, + author={Febriyanti, Syintya and Nursidah, Dea Ratu and Gustiara, Dela and Yulianti, Rika}, + journal={Khazanah: Jurnal Mahasiswa}, + volume={13}, + number={2}, + year={2021} +} +""" + +_DESCRIPTION = """\ +Dataset containing tweets about COVID-19 vaccines with manually labelled information about whether they are a +subjective tweet and their sentiment polarity. Tweets are from 20-27 June 2021 and 15-22 July 2021. +""" + +_HOMEPAGE = "https://github.com/rayendito/id-vaccines-tweets" + +_LICENSE = Licenses.UNKNOWN.value + +_URL = "https://raw.githubusercontent.com/rayendito/id-vaccines-tweets/main/id_vaccines_tweets.csv" + +_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class IdVaccinesTweetsDataset(datasets.GeneratorBasedBuilder): + """This is a seacrowd dataloader for id_vaccines_tweets, for every example in the dataset, it contains a subjective + tweet and their sentiment polarity. 
Tweets are from 20-27 June 2021 and 15-22 July 2021.""" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=_DESCRIPTION, + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_text", + version=datasets.Version(_SEACROWD_VERSION), + description=_DESCRIPTION, + schema="seacrowd_text", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self): + if self.config.schema == "source": + features = datasets.Features( + { + "idx": datasets.Value("string"), + "form_text": datasets.Value("string"), + "norm_text": datasets.Value("string"), + "subjective": datasets.Value("float"), + "sentiment": datasets.Value("float"), + } + ) + elif self.config.schema == "seacrowd_text": + features = schemas.text_features([-1.0, 0.0, 1.0]) + else: + raise ValueError(f"Invalid config: {self.config.name}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """ "return splitGenerators""" + downloaded_files = dl_manager.download_and_extract(_URL) + return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files})] + + def _generate_examples(self, filepath): + data_lines = pandas.read_csv(filepath, skip_blank_lines=True) + keys = data_lines.keys() + indexes = data_lines[keys[0]][1:] + norms = data_lines[keys[1]][1:] + formals = data_lines[keys[2]][1:] + subjs = data_lines[keys[3]][1:] + posnegs = data_lines[keys[4]][1:] + if self.config.schema == "source": + for idx, (ind, norm, form, subj, posneg) in enumerate(zip(indexes, norms, formals, subjs, posnegs)): + yield idx, { + "idx": str(ind), + "form_text": form, + "norm_text": norm, + "subjective": float(subj), + "sentiment": float(posneg), + } + if self.config.schema == "seacrowd_text": + for idx, (ind, norm, posneg) in enumerate(zip(indexes, norms, posnegs)): + yield idx, {"id": str(ind), "text": norm, "label": float(posneg)} diff --git a/seacrowd/sea_datasets/id_wsd/__init__.py b/seacrowd/sea_datasets/id_wsd/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/id_wsd/id_wsd.py b/seacrowd/sea_datasets/id_wsd/id_wsd.py index 2b48f9096..f51a928c1 100644 --- a/seacrowd/sea_datasets/id_wsd/id_wsd.py +++ b/seacrowd/sea_datasets/id_wsd/id_wsd.py @@ -31,7 +31,7 @@ _LANGUAGES = ["ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) _LOCAL = False -_DATASETNAME = "indonesian_wsd" +_DATASETNAME = "id_wsd" _DESCRIPTION = """\ Word Sense Disambiguation (WSD) is a task to determine the correct sense of an ambiguous word. 
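This hunk (together with the one that follows) renames the dataset id from indonesian_wsd to id_wsd, so downstream code has to switch to the new config names. A minimal sketch, assuming the usual script-based loading path:

from datasets import load_dataset

script = "seacrowd/sea_datasets/id_wsd/id_wsd.py"

# The old names ("indonesian_wsd_source", "indonesian_wsd_seacrowd_t2t") no longer exist after this patch.
source = load_dataset(script, name="id_wsd_source")
t2t = load_dataset(script, name="id_wsd_seacrowd_t2t")

print(t2t)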
@@ -88,18 +88,18 @@ class IndonesianWSD(datasets.GeneratorBasedBuilder): BUILDER_CONFIGS = [ SEACrowdConfig( - name="indonesian_wsd_source", + name="id_wsd_source", version=SOURCE_VERSION, description="Indonesian WSD source schema", schema="source", - subset_id="indonesian_wsd", + subset_id="id_wsd", ), SEACrowdConfig( - name="indonesian_wsd_seacrowd_t2t", + name="id_wsd_seacrowd_t2t", version=SEACROWD_VERSION, description="Indonesian WSD Nusantara schema", schema="seacrowd_t2t", - subset_id="indonesian_wsd", + subset_id="id_wsd", ), ] diff --git a/seacrowd/sea_datasets/identifikasi_bahasa/__init__.py b/seacrowd/sea_datasets/identifikasi_bahasa/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/identifikasi_bahasa/identifikasi_bahasa.py b/seacrowd/sea_datasets/identifikasi_bahasa/identifikasi_bahasa.py new file mode 100644 index 000000000..4305a5874 --- /dev/null +++ b/seacrowd/sea_datasets/identifikasi_bahasa/identifikasi_bahasa.py @@ -0,0 +1,136 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{Tuhenay2021, + title = {Perbandingan Klasifikasi Bahasa Menggunakan Metode Naïve Bayes Classifier (NBC) Dan Support Vector Machine (SVM)}, + volume = {4}, + ISSN = {2656-1948}, + url = {http://dx.doi.org/10.33387/jiko.v4i2.2958}, + DOI = {10.33387/jiko.v4i2.2958}, + number = {2}, + journal = {JIKO (Jurnal Informatika dan Komputer)}, + publisher = {LPPM Universitas Khairun}, + author = {Tuhenay, Deglorians}, + year = {2021}, + month = aug, + pages = {105-111} +} +""" + +_DATASETNAME = "identifikasi_bahasa" + +_DESCRIPTION = """\ +The identifikasi-bahasa dataset includes text samples in Indonesian, Ambonese, and Javanese. \ +Each entry is comprised of cleantext, representing the sentence content, and a label identifying the language. \ +The manual input process involved grouping the data by language categories, \ +with labels for language identification and cleantext representing sentence content. The dataset, excluding punctuation and numbers, \ +consists of a minimum of 3,000 Ambonese, 10,000 Javanese, \ +and 3,500 Indonesian language entries, meeting the research's minimum standard for effective language identification. 
+""" + +_HOMEPAGE = "https://github.com/joanitolopo/identifikasi-bahasa" +_LANGUAGES = ["ind", "jav", "abs"] + +_LICENSE = Licenses.APACHE_2_0.value +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://github.com/joanitolopo/identifikasi-bahasa/raw/main/DataKlasifikasi.xlsx", +} + +_SUPPORTED_TASKS = [Tasks.LANGUAGE_IDENTIFICATION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" +_TAGS = ["Ambon", "Indo", "Jawa"] + + +class IdentifikasiBahasaDataset(datasets.GeneratorBasedBuilder): + """The "identifikasi-bahasa" dataset, manually grouped by language, \ + contains labeled Indonesian, Ambonese, and Javanese text entries, excluding \ + punctuation and numbers, with a minimum of 3,000 Ambonese, 10,000 Javanese, \ + and 3,500 Indonesian entries for effective language identification.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + SEACROWD_SCHEMA_NAME = "text" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features({"cleanText": datasets.Value("string"), "label": datasets.Value("string")}) + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text_features(_TAGS) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir, + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + dataset = pd.read_excel(filepath) + + if self.config.schema == "source": + for i, row in dataset.iterrows(): + yield i, {"cleanText": row["cleanText"], "label": row["label"]} + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + for i, row in dataset.iterrows(): + yield i, {"id": i, "text": row["cleanText"], "label": row["label"]} diff --git a/seacrowd/sea_datasets/ind_proner/ind_proner.py b/seacrowd/sea_datasets/ind_proner/ind_proner.py new file mode 100644 index 000000000..eb7a15e5f --- /dev/null +++ b/seacrowd/sea_datasets/ind_proner/ind_proner.py @@ -0,0 +1,191 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from seacrowd.utils import schemas
+from seacrowd.utils.common_parser import load_conll_data
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """\
+@INPROCEEDINGS{9212879,
+  author={Akmal, Muhammad and Romadhony, Ade},
+  booktitle={2020 International Conference on Data Science and Its Applications (ICoDSA)},
+  title={Corpus Development for Indonesian Product Named Entity Recognition Using Semi-supervised Approach},
+  year={2020},
+  volume={},
+  number={},
+  pages={1-5},
+  keywords={Feature extraction;Labeling;Buildings;Semisupervised learning;Training data;Text recognition;Manuals;proner;semi-supervised learning;crf},
+  doi={10.1109/ICoDSA50139.2020.9212879}
+}
+"""
+
+_DATASETNAME = "ind_proner"
+
+_DESCRIPTION = """\
+Indonesian PRONER is a corpus for Indonesian product named entity recognition. It contains data that was labeled manually
+and data that was labeled automatically through a semi-supervised learning approach of conditional random fields (CRF).
+"""
+
+_HOMEPAGE = "https://github.com/dziem/proner-labeled-text"
+
+_LANGUAGES = {"ind": "id"}
+
+_LANGUAGE_CODES = list(_LANGUAGES.values())
+
+_LICENSE = Licenses.CC_BY_4_0.value
+
+_LOCAL = False
+
+_URLS = {
+    "automatic": "https://raw.githubusercontent.com/dziem/proner-labeled-text/master/automatically_labeled.tsv",
+    "manual": "https://raw.githubusercontent.com/dziem/proner-labeled-text/master/manually_labeled.tsv",
+}
+
+_ANNOTATION_TYPES = list(_URLS.keys())
+_ANNOTATION_IDXS = {"l1": 0, "l2": 1}
+
+_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION]
+
+_SOURCE_VERSION = "1.0.0"
+
+_SEACROWD_VERSION = "1.0.0"
+
+logger = datasets.logging.get_logger(__name__)
+
+
+class IndPRONERDataset(datasets.GeneratorBasedBuilder):
+    """
+    Indonesian PRONER is a product named entity recognition dataset from https://github.com/dziem/proner-labeled-text.
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = ( + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{annotation_type}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME}_{annotation_type} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{annotation_type}", + ) + for annotation_type in _ANNOTATION_TYPES + ] + + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{annotation_type}_l1_seacrowd_seq_label", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME}_{annotation_type}_l1 SEACrowd schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}_{annotation_type}_l1", + ) + for annotation_type in _ANNOTATION_TYPES + ] + + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{annotation_type}_l2_seacrowd_seq_label", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME}_{annotation_type}_l2 SEACrowd schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}_{annotation_type}_l2", + ) + for annotation_type in _ANNOTATION_TYPES + ] + ) + + label_classes = [ + "B-PRO", + "B-BRA", + "B-TYP", + "I-PRO", + "I-BRA", + "I-TYP", + "O", + ] + + def _extract_label(self, text: str, idx: int) -> str: + split = text.split("|") + if len(split) > 1 and idx != -1: + return split[idx] + else: + return text + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "ner_tags": datasets.Sequence(datasets.Value("string")), + } + ) + elif self.config.schema == "seacrowd_seq_label": + features = schemas.seq_label_features(label_names=self.label_classes) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """ + Returns SplitGenerators. + """ + + annotation_type = self.config.subset_id.split("_")[2] + path = dl_manager.download_and_extract(_URLS[annotation_type]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": path, + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """ + Yields examples as (key, example) tuples. 
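To make the l1/l2 configs above concrete: some tokens carry two pipe-separated tags (two annotation levels), and _extract_label picks one of them per subset. A standalone illustration of that logic; the tag value used here is hypothetical.

def extract_label(tag: str, idx: int) -> str:
    # Mirrors IndPRONERDataset._extract_label: keep the raw tag for the source
    # schema (idx == -1), otherwise select the requested annotation level.
    parts = tag.split("|")
    return parts[idx] if len(parts) > 1 and idx != -1 else tag

raw_tag = "B-PRO|B-BRA"            # hypothetical doubly annotated token
print(extract_label(raw_tag, 0))   # "B-PRO"  -> *_l1_seacrowd_seq_label configs
print(extract_label(raw_tag, 1))   # "B-BRA"  -> *_l2_seacrowd_seq_label configs
print(extract_label(raw_tag, -1))  # unchanged -> *_source configs keep the raw tag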
+ """ + label_idx = -1 + subset_id = self.config.subset_id.split("_") + if len(subset_id) > 3: + if subset_id[3] in _ANNOTATION_IDXS: + label_idx = _ANNOTATION_IDXS[subset_id[3]] + + idx = 0 + conll_dataset = load_conll_data(filepath) + if self.config.schema == "source": + for _, row in enumerate(conll_dataset): + x = {"id": str(idx), "tokens": row["sentence"], "ner_tags": list(map(self._extract_label, row["label"], [label_idx] * len(row["label"])))} + yield idx, x + idx += 1 + elif self.config.schema == "seacrowd_seq_label": + for _, row in enumerate(conll_dataset): + x = {"id": str(idx), "tokens": row["sentence"], "labels": list(map(self._extract_label, row["label"], [label_idx] * len(row["label"])))} + yield idx, x + idx += 1 diff --git a/seacrowd/sea_datasets/indo_story_cloze/__init__.py b/seacrowd/sea_datasets/indo_story_cloze/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indo_story_cloze/indo_story_cloze.py b/seacrowd/sea_datasets/indo_story_cloze/indo_story_cloze.py new file mode 100644 index 000000000..e233fe44b --- /dev/null +++ b/seacrowd/sea_datasets/indo_story_cloze/indo_story_cloze.py @@ -0,0 +1,179 @@ +import csv +import random +import string +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@inproceedings{koto-etal-2022-cloze, + title = "Cloze Evaluation for Deeper Understanding of Commonsense Stories in {I}ndonesian", + author = "Koto, Fajri and + Baldwin, Timothy and + Lau, Jey Han", + editor = "Bosselut, Antoine and + Li, Xiang and + Lin, Bill Yuchen and + Shwartz, Vered and + Majumder, Bodhisattwa Prasad and + Lal, Yash Kumar and + Rudinger, Rachel and + Ren, Xiang and + Tandon, Niket and + Zouhar, Vil{\'e}m", + booktitle = "Proceedings of the First Workshop on Commonsense Representation and Reasoning (CSRR 2022)", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.csrr-1.2", + doi = "10.18653/v1/2022.csrr-1.2", + pages = "8--16", +} +""" + +_DATASETNAME = "indo_story_cloze" + +_DESCRIPTION = """ +A Story Cloze Test framework in Indonesian. A story in our dataset consists of four-sentence premise, one-sentence +correct ending, and one-sentence incorrect ending. In total, we have created 2,325 Indonesian stories with the +train/dev/test split 1,000/200/1,135. 
+""" + +_HOMEPAGE = "https://huggingface.co/datasets/indolem/indo_story_cloze" + +_LANGUAGES = ["ind"] + +_LICENSE = Licenses.CC_BY_SA_4_0.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: { + "train": "https://huggingface.co/datasets/indolem/indo_story_cloze/resolve/main/train.csv", + "dev": "https://huggingface.co/datasets/indolem/indo_story_cloze/resolve/main/dev.csv", + "test": "https://huggingface.co/datasets/indolem/indo_story_cloze/resolve/main/test.csv", + }, +} + +_SUPPORTED_TASKS = [Tasks.COMMONSENSE_REASONING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class IndoStoryClozeDataset(datasets.GeneratorBasedBuilder): + """IndoStoryCloze is a Story Cloze dataset in Indonesian.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_qa", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_qa", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "sentence-1": datasets.Value("string"), + "sentence-2": datasets.Value("string"), + "sentence-3": datasets.Value("string"), + "sentence-4": datasets.Value("string"), + "correct_ending": datasets.Value("string"), + "incorrect_ending": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_qa": + features = schemas.qa_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": data_dir, "split": "train"}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": data_dir, "split": "test"}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": data_dir, "split": "dev"}, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + if self.config.schema == "source": + data = csv.DictReader(open(filepath[split], newline="", encoding="utf-8")) + for i, row in enumerate(data): + yield i, { + "sentence-1": row["Kalimat-1"], + "sentence-2": row["Kalimat-2"], + "sentence-3": row["Kalimat-3"], + "sentence-4": row["Kalimat-4"], + "correct_ending": row["Correct Ending"], + "incorrect_ending": row["Incorrect Ending"], + } + + elif self.config.schema == "seacrowd_qa": + data = csv.DictReader(open(filepath[split], newline="", encoding="utf-8")) + + def build_question(line): + # Concatenate the 4 sentences, this can either be the question of the context. Set is as question for + # now. Some sentences do not have punctuation, hence adding . before concatenation. 
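+                # For example (hypothetical sentences): "Budi pergi ke pasar" has no final
+                # punctuation, so it becomes "Budi pergi ke pasar." before joining, while
+                # "Dia membeli apel." is left untouched.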
+ sentences = [] + for k in ["Kalimat-1", "Kalimat-2", "Kalimat-3", "Kalimat-4"]: + if line[k].strip()[-1] not in string.punctuation: + sentences.append(line[k] + ".") + else: + sentences.append(line[k]) + return " ".join(sentences) + + for i, row in enumerate(data): + yield i, { + "id": str(i), + "question_id": str(i), + "document_id": str(i), + "question": build_question(row), + "type": "multiple_choice", + # Reorder choices based on the randomly generated labels, avoiding correct answer at the same order. + "choices": [row["Correct Ending"], row["Incorrect Ending"]] if random.randint(0, 1) == 0 else [row["Incorrect Ending"], row["Correct Ending"]], + "context": "", + "answer": [row["Correct Ending"]], + "meta": {}, + } diff --git a/seacrowd/sea_datasets/indocamrest/__init__.py b/seacrowd/sea_datasets/indocamrest/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indocamrest/indocamrest.py b/seacrowd/sea_datasets/indocamrest/indocamrest.py new file mode 100644 index 000000000..b2ed78b7d --- /dev/null +++ b/seacrowd/sea_datasets/indocamrest/indocamrest.py @@ -0,0 +1,163 @@ +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses + +_CITATION = """\ +@article{kautsar2023indotod, + author={Kautsar, Muhammad Dehan Al and Nurdini, Rahmah Khoirussyifa' and Cahyawijaya, Samuel and Winata, Genta Indra and Purwarianti, Ayu}, + title={IndoToD: A Multi-Domain Indonesian Benchmark For End-to-End Task-Oriented Dialogue Systems}, + journal={arXiv preprint arXiv:2311.00958}, + year={2023}, +} +""" + +_LANGUAGES = ["ind"] +_LOCAL = False + +_DATASETNAME = "indocamrest" + +_DESCRIPTION = """\ +IndoCamRest is a synthetic task-oriented dialogue system dataset that translated from Cambridge Restaurant 676 (CamRest) dataset (Wen et al., 2016) into the new Indonesian parallel dataset using the translation pipeline method including the delexicalization, translation, and delexicalization. +The dataset consists of 676 dialogues in the restaurant reservation domain, with a user and an agent talking to each other to search the restaurant near the user. +It also consists of slots and dialogue acts from the user and the agent. 
+""" + +_HOMEPAGE = "https://github.com/dehanalkautsar/IndoToD/tree/main/IndoCamRest" + +_LICENSE = Licenses.CC_BY_SA_4_0.value + +_URLS = { + _DATASETNAME: "https://raw.githubusercontent.com/dehanalkautsar/IndoToD/main/IndoCamRest/IndoCamRest676.json", +} + +_SUPPORTED_TASKS = [Tasks.E2E_TASK_ORIENTED_DIALOGUE] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class IndoCamRest(datasets.GeneratorBasedBuilder): + """IndoToD: A Multi-Domain Indonesian Benchmark For End-to-End Task-Oriented Dialogue Systems""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="indocamrest_source", + version=SOURCE_VERSION, + description="IndoToD: IndoCamRest source schema", + schema="source", + subset_id="indocamrest", + ), + SEACrowdConfig( + name="indocamrest_seacrowd_tod", + version=SEACROWD_VERSION, + description="IndoToD: IndoCamRest SEACrowd End-to-end Task Oriented Dialogue schema", + schema="seacrowd_tod", + subset_id="indocamrest", + ), + ] + + DEFAULT_CONFIG_NAME = "indocamrest_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "index": datasets.Value("string"), + "dialogue_id": datasets.Value("int32"), + "finished": datasets.Value("string"), + "goal": {"constraints": [[datasets.Value("string")]], "request-slots": [datasets.Value("string")], "text": datasets.Value("string")}, + "dial": [ + { + "turn": datasets.Value("int32"), + "usr": { + "transcript": datasets.Value("string"), + "delex_transcript": datasets.Value("string"), + "slu": [{"act": datasets.Value("string"), "slots": [[datasets.Value("string")]]}], + }, + "sys": {"sent": datasets.Value("string"), "delex_sent": datasets.Value("string"), "DA": [datasets.Value("string")]}, + } + ], + } + ) + elif self.config.schema == "seacrowd_tod": + features = schemas.tod_features + else: + raise NotImplementedError(f"Schema {self.config.schema} has not been implemented") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir, + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + with open(filepath, "r+") as fw: + data = json.loads(fw.read()) + + if self.config.schema == "source": + for idx, example in enumerate(data): + example["index"] = str(idx) + yield str(idx), example + + elif self.config.schema == "seacrowd_tod": + for idx, tod_dialogue in enumerate(data): + example = {} + example["dialogue_idx"] = idx + + dialogue = [] + for i in range(len(tod_dialogue["dial"]) + 1): + dial = {} + dial["turn_idx"] = i + + # system_utterance properties + if i == 0: + # case if turn_idx == 0 + dial["system_utterance"] = "" + dial["system_acts"] = [] + else: + dial["system_utterance"] = tod_dialogue["dial"][i - 1]["sys"]["sent"] + # some system_acts is either to string or list of strings, + # converting all to list of strings + dial["system_acts"] = [[act] if isinstance(act, str) else act for act in tod_dialogue["dial"][i - 1]["sys"]["DA"]] + + # user_utterance properties + dial["turn_label"] = [] + dial["belief_state"] = [] + 
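+                    # The SEACrowd TOD schema pairs the system utterance of source turn i-1 with the
+                    # user utterance of source turn i, which is why the loop runs one extra iteration:
+                    # the final pass only carries the last system utterance plus an empty user turn.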
if i == len(tod_dialogue["dial"]): + # case if turn_idx > len(dialogue) --> add dummy user_utterance + dial["user_utterance"] = "" + else: + dial["user_utterance"] = tod_dialogue["dial"][i]["usr"]["transcript"] + for j in range(len(tod_dialogue["dial"][i]["usr"]["slu"])): + dial["belief_state"].append({"slots": tod_dialogue["dial"][i]["usr"]["slu"][j]["slots"], "act": tod_dialogue["dial"][i]["usr"]["slu"][j]["act"]}) + + # append to dialogue + dialogue.append(dial) + example["dialogue"] = dialogue + yield str(idx), example diff --git a/seacrowd/sea_datasets/indoler/__init__.py b/seacrowd/sea_datasets/indoler/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indoler/indoler.py b/seacrowd/sea_datasets/indoler/indoler.py new file mode 100644 index 000000000..b3768495b --- /dev/null +++ b/seacrowd/sea_datasets/indoler/indoler.py @@ -0,0 +1,214 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +https://github.com/ir-nlp-csui/indoler/tree/main +The dataset contains 993 annotated court decission document. +The document was taken from Decision of the Supreme Court of Indonesia. +The documents have also been tokenized and cleaned +""" +import os +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses + +_CITATION = """\ +@INPROCEEDINGS{9263157, + author={Nuranti, Eka Qadri and Yulianti, Evi}, + booktitle={2020 International Conference on Advanced Computer Science and Information Systems (ICACSIS)}, + title={Legal Entity Recognition in Indonesian Court Decision Documents Using Bi-LSTM and CRF Approaches}, + year={2020}, + volume={}, + number={}, + pages={429-434}, + keywords={Xenon;6G mobile communication;legal processing;legal entity recognition;legal document;name entity recognition;ner;bi-lstm;lstm;crf}, + doi={10.1109/ICACSIS51025.2020.9263157}} +""" + +_DATASETNAME = "indoler" + +_DESCRIPTION = """\ +https://github.com/ir-nlp-csui/indoler/tree/main +The data can be used for NER Task in legal documents. +The dataset contains 993 annotated court decission document. +The document was taken from Decision of the Supreme Court of Indonesia. 
+The documents have also been tokenized and cleaned +""" + +_HOMEPAGE = "https://github.com/ir-nlp-csui/indoler/tree/main" + +_LANGUAGES = ['ind'] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: { + "test_idx": "https://raw.githubusercontent.com/ir-nlp-csui/indoler/main/test.ids.csv", + "train_idx": "https://raw.githubusercontent.com/ir-nlp-csui/indoler/main/train.ids.csv", + "valid_idx": "https://raw.githubusercontent.com/ir-nlp-csui/indoler/main/val.ids.csv", + "full_data": "https://raw.githubusercontent.com/ir-nlp-csui/indoler/main/data.json" + }, +} + +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] + +_SOURCE_VERSION = "2.0.0" + +_SEACROWD_VERSION = "1.0.0" + + + +class IndoLer(datasets.GeneratorBasedBuilder): + """https://github.com/ir-nlp-csui/indoler/tree/main +The data can be used for NER Task in legal documents +The dataset contains 993 annotated court decission document. +The document was taken from Decision of the Supreme Court of Indonesia. +The documents have also been tokenized and cleaned""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="indoler_source", + version=SOURCE_VERSION, + description="indoler source schema", + schema="source", + subset_id="indoler", + ), + SEACrowdConfig( + name="indoler_seacrowd_seq_label", + version=SEACROWD_VERSION, + description="indoler SEACrowd schema", + schema="seacrowd_seq_label", + subset_id="indoler", + ), + ] + + DEFAULT_CONFIG_NAME = "indoler_source" + + def _info(self) -> datasets.DatasetInfo: + + NAMED_ENTITIES = ['O', 'B-Jenis Amar', 'B-Jenis Dakwaan', 'B-Jenis Perkara', 'B-Melanggar UU (Dakwaan)', + 'B-Melanggar UU (Pertimbangan Hukum)', 'B-Melanggar UU (Tuntutan)', 'B-Nama Hakim Anggota', 'B-Nama Hakim Ketua', + 'B-Nama Jaksa', 'B-Nama Panitera', 'B-Nama Pengacara', 'B-Nama Pengadilan', + 'B-Nama Saksi', 'B-Nama Terdakwa', 'B-Nomor Putusan', 'B-Putusan Hukuman', + 'B-Tanggal Kejadian', 'B-Tanggal Putusan', 'B-Tingkat Kasus', 'B-Tuntutan Hukuman', + 'I-Jenis Amar', 'I-Jenis Dakwaan', 'I-Jenis Perkara', 'I-Melanggar UU (Dakwaan)', + 'I-Melanggar UU (Pertimbangan Hukum)', 'I-Melanggar UU (Tuntutan)', 'I-Nama Hakim Anggota', 'I-Nama Hakim Ketua', + 'I-Nama Jaksa', 'I-Nama Panitera', 'I-Nama Pengacara', 'I-Nama Pengadilan', + 'I-Nama Saksi', 'I-Nama Terdakwa', 'I-Nomor Putusan', 'I-Putusan Hukuman', + 'I-Tanggal Kejadian', 'I-Tanggal Putusan', 'I-Tingkat Kasus', 'I-Tuntutan Hukuman'] + + if self.config.schema == "source": + features = datasets.Features({ + "id": datasets.Value("string"), + "owner": datasets.Value("string"), + "lawyer": datasets.ClassLabel(names=[False, True]), + "verdict": datasets.ClassLabel(names=["guilty", "bebas", "lepas"]), + "indictment": datasets.ClassLabel(names=["NA", "tunggal", "subsider", "komul", "alternatif", "kombinasi", "gabungan"]), + "text-tags": datasets.Sequence(datasets.ClassLabel(names=NAMED_ENTITIES)), + "text": datasets.Sequence(datasets.Value("string")), + }) + elif self.config.schema == "seacrowd_seq_label": + features = schemas.seq_label.features(NAMED_ENTITIES) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = 
_URLS[_DATASETNAME] + test_path = dl_manager.download_and_extract(urls['test_idx']) + train_path = dl_manager.download_and_extract(urls['train_idx']) + valid_path = dl_manager.download_and_extract(urls['valid_idx']) + data_path = dl_manager.download_and_extract(urls['full_data']) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_path, + "idx_path": train_path, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_path, + "idx_path": test_path, + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_path, + "idx_path": valid_path, + "split": "validation", + }, + ), + ] + + def _generate_examples(self, filepath: Path, idx_path: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + split_idxs = [] + with open(idx_path, 'r', encoding="utf-8") as indexes: + for index in indexes.readlines(): + split_idxs.append(int(index)) + with open(filepath, 'r', encoding="utf-8") as file: + contents = json.load(file) + counter = 0 + for content in contents: + if int(content['id']) in split_idxs: + if self.config.schema == "source": + if content['indictment'] not in ["NA", "tunggal", "subsider", "komul", "alternatif", "kombinasi", "gabungan"]: + content['indictment'] = "NA" + yield( + counter, + { + "id" : content['id'], + "owner" : content['owner'], + "lawyer" : content['lawyer'], + "verdict" : content['verdict'], + "indictment": content['indictment'], + "text-tags" : content['text-tags'], + "text" : content['text'], + } + ) + counter += 1 + elif self.config.schema == "seacrowd_seq_label": + yield( + counter, + { + "id": content['id'], + "tokens": content['text'], + "labels": content['text-tags'], + } + ) + counter += 1 diff --git a/seacrowd/sea_datasets/indommlu/__init__.py b/seacrowd/sea_datasets/indommlu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indommlu/indommlu.py b/seacrowd/sea_datasets/indommlu/indommlu.py new file mode 100644 index 000000000..7d7959cff --- /dev/null +++ b/seacrowd/sea_datasets/indommlu/indommlu.py @@ -0,0 +1,291 @@ +import csv +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@inproceedings{koto-etal-2023-large, + title = "Large Language Models Only Pass Primary School Exams in {I}ndonesia: A Comprehensive Test on {I}ndo{MMLU}", + author = "Koto, Fajri and + Aisyah, Nurul and + Li, Haonan and + Baldwin, Timothy", + editor = "Bouamor, Houda and + Pino, Juan and + Bali, Kalika", + booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing", + month = dec, + year = "2023", + address = "Singapore", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2023.emnlp-main.760", + doi = "10.18653/v1/2023.emnlp-main.760", + pages = "12359--12374", +} +""" + +_DATASETNAME = "indommlu" + +_DESCRIPTION = """ +IndoMMLU is the first multi-task language understanding benchmark for Indonesian culture and languages, which consists +of questions from primary school to university entrance exams in Indonesia. 
By employing professional teachers, we +obtain 14,906 questions across 63 tasks and education levels, with 46% of the questions focusing on assessing +proficiency in the Indonesian language and knowledge of nine local languages and cultures in Indonesia. +""" + +_HOMEPAGE = "https://huggingface.co/datasets/indolem/IndoMMLU" + +_LANGUAGES = ["ind", "ban", "mad", "nij", "sun", "jav", "mak", "bjn", "abl"] + +_LICENSE = Licenses.CC_BY_NC_SA_4_0.value + +_LOCAL = False + +_URLS = {_DATASETNAME: {"test": "https://huggingface.co/datasets/indolem/IndoMMLU/resolve/main/IndoMMLU.csv"}} + +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +lang2subject = {"ind": "Bahasa Indonesia", "ban": "Bahasa Bali", "mad": "Bahasa Madura", "nij": "Bahasa Dayak Ngaju", "sun": "Bahasa Sunda", "jav": "Bahasa Jawa", "mak": "Bahasa Makassar", "bjn": "Bahasa Banjar", "abl": "Bahasa Lampung"} + +subject2english = { + "Sejarah": "History", + "Geografi": "Geography", + "Bahasa Lampung": "Lampungic", + "IPS": "Social science", + "Bahasa Bali": "Balinese", + "Bahasa Makassar": "Makassarese", + "Bahasa Banjar": "Banjarese", + "Kimia": "Chemistry", + "Biologi": "Biology", + "IPA": "Science", + "Agama Kristen": "Christian religion", + "Kesenian": "Art", + "Agama Islam": "Islam religion", + "Agama Hindu": "Hindu religion", + "Bahasa Madura": "Madurese", + "Penjaskes": "Sport", + "Bahasa Indonesia": "Indonesian language", + "Fisika": "Physics", + "Budaya Alam Minangkabau": "Minangkabau culture", + "Bahasa Dayak Ngaju": "Dayak language", + "Sosiologi": "Sociology", + "Ekonomi": "Economy", + "Bahasa Sunda": "Sundanese", + "Bahasa Jawa": "Javanese", + "PPKN": "Civic education", +} + +subject2group = { + "Sejarah": "Humanities", + "Geografi": "Social science", + "Bahasa Lampung": "Local languages and cultures", + "IPS": "Social science", + "Bahasa Bali": "Local languages and cultures", + "Bahasa Makassar": "Local languages and cultures", + "Bahasa Banjar": "Local languages and cultures", + "Kimia": "STEM", + "Biologi": "STEM", + "IPA": "STEM", + "Agama Kristen": "Humanities", + "Kesenian": "Humanities", + "Agama Islam": "Humanities", + "Agama Hindu": "Humanities", + "Bahasa Madura": "Local languages and cultures", + "Penjaskes": "Humanities", + "Bahasa Indonesia": "Indonesian language", + "Fisika": "STEM", + "Budaya Alam Minangkabau": "Local languages and cultures", + "Bahasa Dayak Ngaju": "Local languages and cultures", + "Sosiologi": "Social science", + "Ekonomi": "Social science", + "Bahasa Sunda": "Local languages and cultures", + "Bahasa Jawa": "Local languages and cultures", + "PPKN": "Social science", +} + +special_case = ["SD-SMP-SMA", "SD-SMP"] +level_mapper = { + "SMA": "SMA", # SMA --> high school level" + "Seleksi PTN": "University entrance test", + "SD": "SD", # SD --> elementary school level + "SMP": "SMP", # SMP --> junior high school level + "Kelas I SD": "SD", + "Kelas X SMA": "SMA", + "Kelas XI SMA": "SMA", + "Kelas XII SMA": "SMA", + "V SD": "SD", + "VI SD": "SD", + "VII SMP": "SMP", + "VIII SMP ": "SMP", + "IX SMP": "SMP", + "Kelas III SD": "SD", + "Kelas IV SD": "SD", + "Kelas II SD": "SD", +} + + +def fix_level(level, kelas): + # Fixing Level + if level in special_case: + kelas = float(kelas) + if kelas >= 1 and kelas <= 6: + level = "SD" + elif kelas >= 7 and kelas <= 9: + level = "SMP" + elif kelas >= 10: + level = "SMA" + else: + print(level) + fixed_level = level_mapper[level] + + # Fixing class + kelas = str(kelas) + if kelas.strip() in ["PTN", 
"2023-10-12 00:00:00"]: + fixed_kelas = 13 + elif kelas == "4,5,6": + fixed_kelas = 6 + else: + fixed_kelas = int(float(kelas.strip())) + + # sanity check over the level and kelas + return fixed_level, fixed_kelas + + +def pass_schema_filter(schema, row): + if schema == "source": + return True + lang = schema.split("_")[1] + if lang not in _LANGUAGES: # seacrowd_qa + return True + if lang == "ind": # contains "Bahasa Indonesia" and all other non-language subjects + return (lang2subject[lang] == row["subject"]) or (row["subject"] not in lang2subject.values()) + return lang2subject[lang] == row["subject"] + + +class IndoMMLUDataset(datasets.GeneratorBasedBuilder): + """IndoMMLU is the first multitask language understanding benchmark for Indonesian culture and languages.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_qa", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_qa", + subset_id=_DATASETNAME, + ), + ] + for lang in _LANGUAGES: + lang_config = SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}_seacrowd_qa", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} {lang} SEACrowd schema", + schema=f"seacrowd_{lang}_qa", + subset_id=_DATASETNAME, + ) + BUILDER_CONFIGS.append(lang_config) + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "subject": datasets.Value("string"), + "group": datasets.Value("string"), + "level": datasets.Value("string"), + "class": datasets.Value("string"), + "question": datasets.Value("string"), + "options": datasets.Value("string"), + "answer": datasets.Value("string"), + "is_for_fewshot": datasets.Value("string"), + } + ) + + else: + features = schemas.qa_features + features["meta"] = { + "subject": datasets.Value("string"), + "group": datasets.Value("string"), + "level": datasets.Value("string"), + "class": datasets.Value("string"), + "is_for_fewshot": datasets.Value("string"), + } + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": data_dir, "split": "test"}, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + data = csv.DictReader(open(filepath[split], newline="")) + print(self.config.schema) + for i, row in enumerate(data): + if pass_schema_filter(self.config.schema, row): + fixed_level, fixed_kelas = fix_level(row["level"], row["kelas"]) + # The choices are in the format of ["A. xxx", "B. xxx", ...], but answer is only with ["A"], replacing both with only the answer content + choices = row["jawaban"].split("\n") + answer_choice = row["kunci"] + # Find the corresponding choice in the choices. + # Skip the 2 datapoint (i = 4223, 14150) with invalid answer_choice. 
+ corresponding_choice = next((choice for choice in choices if choice.startswith(answer_choice)), None) + if corresponding_choice is None: + continue + else: + if self.config.schema == "source": + yield i, { + "subject": subject2english[row["subject"]], + "group": subject2group[row["subject"]], + "level": fixed_level, + "class": fixed_kelas, + "question": row["soal"], + "options": [opt[2:].strip() for opt in choices], # remove A., B., ... in the options, + "answer": corresponding_choice[2:].strip(), # remove A., B., ... in the answer + "is_for_fewshot": row["is_for_fewshot"], + } + else: + yield i, { + "id": str(i), + "question_id": str(i), + "document_id": str(i), + "question": row["soal"], + "type": "multiple_choice", + "choices": [opt[2:].strip() for opt in choices], # remove A., B., ... in the options + "context": "", + "answer": [corresponding_choice[2:].strip()], # remove A., B., ... in the answer, + "meta": {"subject": subject2english[row["subject"]], "group": subject2group[row["subject"]], "level": fixed_level, "class": fixed_kelas, "is_for_fewshot": row["is_for_fewshot"]}, + } diff --git a/seacrowd/sea_datasets/indoner_tourism/__init__.py b/seacrowd/sea_datasets/indoner_tourism/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indoner_tourism/indoner_tourism.py b/seacrowd/sea_datasets/indoner_tourism/indoner_tourism.py new file mode 100644 index 000000000..e09b51363 --- /dev/null +++ b/seacrowd/sea_datasets/indoner_tourism/indoner_tourism.py @@ -0,0 +1,183 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""\ +This dataset is designed for named entity recognition (NER) tasks in the Bahasa Indonesia tourism domain. It contains labeled sequences of named entities, including locations, facilities, and tourism-related entities. The dataset is annotated with the following entity types: + + O (0) : Non-entity or other words not falling into the specified categories. + B-WIS (1): Beginning of a tourism-related entity. + I-WIS (2): Continuation of a tourism-related entity. + B-LOC (3): Beginning of a location entity. + I-LOC (4): Continuation of a location entity. + B-FAS (5): Beginning of a facility entity. + I-FAS (6): Continuation of a facility entity. 
+""" +import os +import pandas as pd +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses + +_CITATION = """\ +@article{JLK, + author = {Ahmad Hidayatullah and Muhammad Fakhri Despawida Aulia Putra and Adityo Permana Wibowo and Kartika Rizqi Nastiti}, + title = { Named Entity Recognition on Tourist Destinations Reviews in the Indonesian Language}, + journal = {Jurnal Linguistik Komputasional}, + volume = {6}, + number = {1}, + year = {2023}, + keywords = {}, + abstract = {To find information about tourist destinations, tourists usually search the reviews about the destinations they want to visit. However, many studies made it hard for them to see the desired information. Named Entity Recognition (NER) is one of the techniques to detect entities in a text. The objective of this research was to make a NER model using BiLSTM to detect and evaluate entities on tourism destination reviews. This research used 2010 reviews of several tourism destinations in Indonesia and chunked them into 116.564 tokens of words. Those tokens were labeled according to their categories: the name of the tourism destination, locations, and facilities. If the tokens could not be classified according to the existing categories, the tokens would be labeled as O (outside). The model has been tested and gives 94,3% as the maximum average of F1-Score.}, + issn = {2621-9336}, pages = {30--35}, doi = {10.26418/jlk.v6i1.89}, + url = {https://inacl.id/journal/index.php/jlk/article/view/89} +} +""" + +_DATASETNAME = "indoner_tourism" + +_DESCRIPTION = """\ +This dataset is designed for named entity recognition (NER) tasks in the Bahasa Indonesia tourism domain. It contains labeled sequences of named entities, including locations, facilities, and tourism-related entities. The dataset is annotated with the following entity types: + + O (0) : Non-entity or other words not falling into the specified categories. + B-WIS (1): Beginning of a tourism-related entity. + I-WIS (2): Continuation of a tourism-related entity. + B-LOC (3): Beginning of a location entity. + I-LOC (4): Continuation of a location entity. + B-FAS (5): Beginning of a facility entity. + I-FAS (6): Continuation of a facility entity. +""" + +_HOMEPAGE = "https://github.com/fathanick/IndoNER-Tourism/tree/main" + +_LANGUAGES = ['ind'] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.AFL_3_0.value + +_LOCAL = False + +_URL = "https://raw.githubusercontent.com/fathanick/IndoNER-Tourism/main/ner_data.tsv" + +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class IndoNERTourismDataset(datasets.GeneratorBasedBuilder): + """\ +This dataset is designed for named entity recognition (NER) tasks in the Bahasa Indonesia tourism domain. It contains labeled sequences of named entities, including locations, facilities, and tourism-related entities. The dataset is annotated with the following entity types: + + O (0) : Non-entity or other words not falling into the specified categories. + B-WIS (1): Beginning of a tourism-related entity. + I-WIS (2): Continuation of a tourism-related entity. + B-LOC (3): Beginning of a location entity. + I-LOC (4): Continuation of a location entity. + B-FAS (5): Beginning of a facility entity. + I-FAS (6): Continuation of a facility entity. 
+""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description="indoner_tourism source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description="indoner_tourism SEACrowd schema", + schema="seacrowd_seq_label", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + 'tokens' : datasets.Sequence(datasets.Value("string")), + 'ner_tags': datasets.Sequence( + datasets.ClassLabel(names=["O", "B-WIS", "I-WIS", "B-LOC", "I-LOC", "B-FAS", "I-FAS"]) + ), + } + ) + elif self.config.schema == "seacrowd_seq_label": + features = schemas.seq_label.features(["O", "B-WIS", "I-WIS", "B-LOC", "I-LOC", "B-FAS", "I-FAS"]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URL + path = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": path, + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + tokens = [] + ner_tags = [] + counter = 0 + with open(filepath, encoding="utf-8") as file: + for line in file: + # End of Sentence met + if line.strip() == "": + if self.config.schema == "source": + yield counter, {'tokens': tokens, 'ner_tags': ner_tags} + counter += 1 + tokens = [] + ner_tags = [] + elif self.config.schema == "seacrowd_seq_label": + yield counter, {'id': counter, 'tokens': tokens, 'labels': ner_tags} + counter += 1 + tokens = [] + ner_tags = [] + # Process until End of Sentence met + elif len(line.split('\t')) == 2: + token, ner_tag = line.split('\t') + tokens.append(token.strip()) + if ner_tag not in ["O", "B-WIS", "I-WIS", "B-LOC", "I-LOC", "B-FAS", "I-FAS"]: + if ner_tag[0] in ["B", "I"]: + if any(tag in ner_tag for tag in ["WIS", "LOC", "FAS"]): + if '_' in ner_tag: + ner_tag = '-'.join(ner_tag.split('_')) + ner_tags.append(ner_tag.strip()) diff --git a/seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/__init__.py b/seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py b/seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py new file mode 100644 index 000000000..2fcad46e6 --- /dev/null +++ b/seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py @@ -0,0 +1,151 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import jsonlines +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{, + author = {supryzhu}, + title = {Indonesia-Chinese-MTRobustEval}, + journal = {None}, + volume = {None}, + year = {2023}, + url = 
{https://github.com/supryzhu/Indonesia-Chinese-MTRobustEval}, + doi = {None}, + biburl = {None}, + bibsource = {None} +} +""" + + +_DATASETNAME = "indonesia_chinese_mtrobusteval" + +_DESCRIPTION = """\ +The dataset is curated for the purpose of evaluating the robustness of Neural Machine Translation (NMT) towards natural occuring noise +(typo, slang, code switching, etc.). The dataset is crawled from Twitter, then pre-processed to obtain sentences with noise. +The dataset consists of a thousand noisy sentences. The dataset is translated into Chinese manually as the benchmark for evaluating the robustness of NMT. +""" + +_HOMEPAGE = "https://github.com/supryzhu/Indonesia-Chinese-MTRobustEval" + +_LANGUAGES = ["ind", "cmn"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + + +_LICENSE = Licenses.MIT.value # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://github.com/supryzhu/Indonesia-Chinese-MTRobustEval/raw/main/data/Indonesia-Chinese.xlsx", +} + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class IndonesiaChineseMtRobustEval(datasets.GeneratorBasedBuilder): + """The dataset consists of a thousand noisy sentences. The dataset is translated into Chinese manually as the benchmark for evaluating the robustness of NMT.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description="indonesia_chinese_mtrobusteval source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_t2t", + version=SEACROWD_VERSION, + description="indonesia_chinese_mtrobusteval SEACrowd schema", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "src": datasets.Value("string"), + "tgt": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + file_path = dl_manager.download(urls) + df = pd.read_excel(file_path) + src = df["Indonesia"].tolist() + tgt = df["Chinese"].tolist() + results = [] + for i, item in enumerate(src): + results.append({"id": str(i), "src": item, "tgt": tgt[i]}) + self._write_jsonl(file_path + ".jsonl", results) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": file_path + ".jsonl", + "split": "train", + }, + ) + ] + + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + if self.config.schema == "source": + i = 0 + with jsonlines.open(filepath) as f: + for each_data in f.iter(): + ex = { + "id": each_data["id"], + "src": each_data["src"], 
+ "tgt": each_data["tgt"], + } + yield i, ex + i += 1 + + elif self.config.schema == "seacrowd_t2t": + i = 0 + with jsonlines.open(filepath) as f: + for each_data in f.iter(): + ex = {"id": each_data["id"], "text_1": each_data["src"], "text_2": each_data["tgt"], "text_1_name": "ind", "text_2_name": "cmn"} + yield i, ex + i += 1 + + def _write_jsonl(self, filepath, values): + with jsonlines.open(filepath, "w") as writer: + for line in values: + writer.write(line) + diff --git a/seacrowd/sea_datasets/indonesian_madurese_bible_translation/__init__.py b/seacrowd/sea_datasets/indonesian_madurese_bible_translation/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indonesian_madurese_bible_translation/indonesian_madurese_bible_translation.py b/seacrowd/sea_datasets/indonesian_madurese_bible_translation/indonesian_madurese_bible_translation.py new file mode 100644 index 000000000..5478c9b61 --- /dev/null +++ b/seacrowd/sea_datasets/indonesian_madurese_bible_translation/indonesian_madurese_bible_translation.py @@ -0,0 +1,180 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +The Madurese Parallel Corpus Dataset is created by scraping content from the online Bible, resulting in 30,013 Indonesian-Madurese sentences. +This corpus is distinct from a previous Madurese dataset, which was gathered from physical documents such as the Kamus Lengkap Bahasa Madura-Indonesia. +The proposed dataset provides bilingual sentences, allowing for comparisons between Indonesian and Madurese. It aims to supplement existing Madurese +corpora, enabling enhanced research and development focused on regional languages in Indonesia. Unlike the prior dataset that included information +like lemmas, pronunciation, linguistic descriptions, part of speech, loanwords, dialects, and various structures, this new corpus primarily focuses +on bilingual sentence pairs, potentially broadening the scope for linguistic studies and language technology advancements in the Madurese language. 
+""" +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import jsonlines + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{, + author = {Sulistyo, Danang Arbian and Wibawa, Aji Prasetya and Prasetya, Didik Dwi and Nafalski, Andrew}, + title = {Autogenerated Indonesian-Madurese Parallel Corpus Dataset Using Neural Machine Translation}, + journal = {Available at SSRN 4644430}, + volume = {}, + year = {2023}, + url = {https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4644430}, + doi = {}, + biburl = {}, + bibsource = {} +} +""" + +_DATASETNAME = "indonesian_madurese_bible_translation" + +_DESCRIPTION = """\ +The Madurese Parallel Corpus Dataset is created by scraping content from the online Bible, resulting in 30,013 Indonesian-Madurese sentences. +This corpus is distinct from a previous Madurese dataset, which was gathered from physical documents such as the Kamus Lengkap Bahasa Madura-Indonesia. +The proposed dataset provides bilingual sentences, allowing for comparisons between Indonesian and Madurese. It aims to supplement existing Madurese +corpora, enabling enhanced research and development focused on regional languages in Indonesia. Unlike the prior dataset that included information +like lemmas, pronunciation, linguistic descriptions, part of speech, loanwords, dialects, and various structures, this new corpus primarily focuses +on bilingual sentence pairs, potentially broadening the scope for linguistic studies and language technology advancements in the Madurese language. +""" + +_HOMEPAGE = "https://data.mendeley.com/datasets/cgtg4bhrtf/3" +_LANGUAGES = ["ind", "mad"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) +_LICENSE = Licenses.CC_BY_4_0.value # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value +_LOCAL = False +_URLS = { + _DATASETNAME: "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/cgtg4bhrtf-3.zip", +} +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] # example: [Tasks.TRANSLITERATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class IndonesianMadureseBibleTranslationDataset(datasets.GeneratorBasedBuilder): + """TODO: This corpus consists of more than 20,000 Indonesian - Madurese sentences.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_t2t", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = "indonesian_madurese_bible_translation_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "src": datasets.Value("string"), + "tgt": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + 
citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + data_dir = os.path.join(data_dir, "Bahasa Madura Corpus Dataset/Indonesian-Madurese Corpus") + all_raw_path = [data_dir + "/" + item for item in os.listdir(data_dir)] + all_path = [] + id = 0 + for raw_path in all_raw_path: + if "txt" in raw_path: + all_path.append(raw_path) + all_data = [] + for path in all_path: + data = self._read_txt(path) + for line in data: + if line != "\n": + all_data.append({"src": line.split("\t")[0], "tgt": line.split("\t")[1], "id": id}) + id += 1 + self._write_jsonl(data_dir + "/train.jsonl", all_data) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join(data_dir, "train.jsonl"), + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + if self.config.schema == "source": + i = 0 + with jsonlines.open(filepath) as f: + for each_data in f.iter(): + ex = { + "id": each_data["id"], + "src": each_data["src"], + "tgt": each_data["tgt"], + } + yield i, ex + i += 1 + + elif self.config.schema == "seacrowd_t2t": + i = 0 + with jsonlines.open(filepath) as f: + for each_data in f.iter(): + ex = {"id": each_data["id"], "text_1": each_data["src"].strip(), "text_2": each_data["tgt"].strip(), "text_1_name": "ind", "text_2_name": "mad"} + yield i, ex + i += 1 + + def _write_jsonl(self, filepath, values): + with jsonlines.open(filepath, "w") as writer: + for line in values: + writer.write(line) + + def _read_txt(self, filepath): + with open(filepath, "r") as f: + lines = f.readlines() + return lines diff --git a/seacrowd/sea_datasets/indonesian_news_dataset/__init__.py b/seacrowd/sea_datasets/indonesian_news_dataset/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indonesian_news_dataset/indonesian_news_dataset.py b/seacrowd/sea_datasets/indonesian_news_dataset/indonesian_news_dataset.py new file mode 100644 index 000000000..55d083783 --- /dev/null +++ b/seacrowd/sea_datasets/indonesian_news_dataset/indonesian_news_dataset.py @@ -0,0 +1,128 @@ +import pickle +from pathlib import Path +from typing import List + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@misc{andreaschandra2020, + author = {Chandra, Andreas}, + title = {Indonesian News Dataset}, + year = {2020}, + howpublished = {Online}, + url = {https://github.com/andreaschandra/indonesian-news}, + note = {Accessed: 2024-02-13}, +} +""" + +_DATASETNAME = "indonesian_news_dataset" + +_DESCRIPTION = """An imbalanced dataset to classify Indonesian News articles. +The dataset contains 5 class labels: bola, news, bisnis, tekno, and otomotif. +The dataset comprises of around 6k train and 2.5k test examples, with the more prevalent classes +(bola and news) having roughly 10x the number of train and test examples than the least prevalent class (otomotif). 
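+Each split is distributed as a pickled pair of lists, where index 0 holds the news texts and index 1 holds the corresponding labels.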
+""" + +_HOMEPAGE = "https://github.com/andreaschandra/indonesian-news" + +_LICENSE = Licenses.UNKNOWN.value + +_URLS = { + f"{_DATASETNAME}_train": "https://drive.usercontent.google.com/u/0/uc?id=1wCwPMKSyTciv8I3g9xGdUfEraA1SydG6&export=download", + f"{_DATASETNAME}_test": "https://drive.usercontent.google.com/u/0/uc?id=1AFW_5KQFW86jlFO16S9bt564Y86WoJjV&export=download", +} + +_SUPPORTED_TASKS = [Tasks.TOPIC_MODELING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + +_TAGS = ["bola", "news", "bisnis", "tekno", "otomotif"] + + +class IndonesianNewsDataset(datasets.GeneratorBasedBuilder): + """The dataset contains 5 Indonesian News articles with imbalanced classes""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + SEACROWD_SCHEMA_NAME = "text" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features({"index": datasets.Value("string"), "news": datasets.Value("string"), "label": datasets.Value("string")}) + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text_features(_TAGS) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + train_dir = Path(dl_manager.download(_URLS[f"{_DATASETNAME}_train"])) + test_dir = Path(dl_manager.download(_URLS[f"{_DATASETNAME}_test"])) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": train_dir, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": test_dir, + "split": "test", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str): + """Yields examples as (key, example) tuples.""" + + with open(filepath, "rb") as file: + news_file = pickle.load(file) + + news_list = news_file[0] + label_list = news_file[1] + + if self.config.schema == "source": + for idx, (news, label) in enumerate(zip(news_list, label_list)): + example = {"index": str(idx), "news": news, "label": label} + yield idx, example + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + for idx, (news, label) in enumerate(zip(news_list, label_list)): + example = {"id": str(idx), "text": news, "label": label} + yield idx, example + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/indonesiannmt/__init__.py b/seacrowd/sea_datasets/indonesiannmt/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indonesiannmt/indonesiannmt.py b/seacrowd/sea_datasets/indonesiannmt/indonesiannmt.py new file mode 100644 index 000000000..97e9c3afb --- /dev/null +++ b/seacrowd/sea_datasets/indonesiannmt/indonesiannmt.py @@ -0,0 +1,216 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the 
current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +The dataset is split into two: +1. Monolingual (ends with .txt) [Indonesian, Javanese] +2. Bilingual (ends with .tsv) [Indonesian-Javanese, Indonesian-Balinese, Indonesian-Minangkabau, Indonesian-Sundanese] +""" +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@misc{susanto2023replicable, + title={Replicable Benchmarking of Neural Machine Translation (NMT) on Low-Resource Local Languages in Indonesia}, + author={Lucky Susanto and Ryandito Diandaru and Adila Krisnadhi and Ayu Purwarianti and Derry Wijaya}, + year={2023}, + eprint={2311.00998}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" +_DATASETNAME = "indonesiannmt" + +_DESCRIPTION = """\ +This dataset is used on the paper "Replicable Benchmarking of Neural Machine Translation (NMT) on Low-Resource Local Languages in Indonesia". This repository contains two types of data: +1. Monolingual (*.txt) [Indonesian, Javanese] +2. Bilingual (*.tsv) [Indonesian-Javanese, Indonesian-Balinese, Indonesian-Minangkabau, Indonesian-Sundanese] +Only the Bilingual dataset is available for this dataloader +""" + +_HOMEPAGE = "https://huggingface.co/datasets/Exqrch/IndonesianNMT" + +_LANGUAGES = ["ind", "jav", "ban", "min", "sun"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.CC_BY_NC_SA_4_0.value + +_LOCAL = False + +_URLS = { + "ind_jav": "https://huggingface.co/datasets/Exqrch/IndonesianNMT/resolve/main/id-jv.tsv?download=true", + "ind_sun": "https://huggingface.co/datasets/Exqrch/IndonesianNMT/resolve/main/id-su.tsv?download=true", + "ind_ban": "https://huggingface.co/datasets/Exqrch/IndonesianNMT/resolve/main/id-ban.tsv?download=true", + "ind_min": "https://huggingface.co/datasets/Exqrch/IndonesianNMT/resolve/main/id-min.tsv?download=true", + "ind": "https://huggingface.co/datasets/Exqrch/IndonesianNMT/resolve/main/bt-id-jv.id.txt?download=true", + "jav": "https://huggingface.co/datasets/Exqrch/IndonesianNMT/resolve/main/bt-id-jv.jv.txt?download=true", +} + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION, Tasks.SELF_SUPERVISED_PRETRAINING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +def seacrowd_config_constructor(modifier, schema, version): + return SEACrowdConfig( + name=f"indonesiannmt_{modifier}_{schema}", + version=version, + description=f"indonesiannmt_{modifier} {schema} schema", + schema=f"{schema}", + subset_id="indonesiannmt", + ) + + +class IndonesianNMT(datasets.GeneratorBasedBuilder): + """IndonesianNMT consists of 4 parallel datasets and 2 monolingual datasets, + all obtained synthetically from either gpt-3.5-turbo or text-davinci-003""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = 
( + [seacrowd_config_constructor(x, "source", _SOURCE_VERSION) for x in ["ind", "jav"]] + + [seacrowd_config_constructor(x, "seacrowd_ssp", _SOURCE_VERSION) for x in ["ind", "jav"]] + + [seacrowd_config_constructor(x, "source", _SOURCE_VERSION) for x in ["ind_jav", "ind_min", "ind_sun", "ind_ban"]] + + [seacrowd_config_constructor(x, "seacrowd_t2t", _SEACROWD_VERSION) for x in ["ind_jav", "ind_min", "ind_sun", "ind_ban"]] + ) + + DEFAULT_CONFIG_NAME = "indonesiannmt_ind_source" + + def is_mono(self): + if self.config.schema == "seacrowd_ssp": + return True + if "source" in self.config.schema: + if len(self.config.name.split("_")) == 3: + return True + return False + + def _info(self) -> datasets.DatasetInfo: + # ex mono: indonesiannmt_ind_source OR indonesiannmt_ind_seacrowd_ssp + # ex para: indonesiannmt_ind_jav_source OR indonesiannmt_ind_jav_seacrowd_t2t + is_mono = self.is_mono() + if is_mono and self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + elif self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "lang_1": datasets.Value("string"), + "lang_2": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_t2t": + features = schemas.text_to_text.features + elif self.config.schema == "seacrowd_ssp": + features = schemas.self_supervised_pretraining.features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # ex mono: indonesiannmt_ind_source OR indonesiannmt_ind_seacrowd_ssp + # ex para: indonesiannmt_ind_jav_source OR indonesiannmt_ind_jav_seacrowd_t2t + is_mono = self.is_mono() + if "seacrowd_ssp" in self.config.schema or is_mono: + lang = self.config.name.split("_")[1] + path = dl_manager.download_and_extract(_URLS[lang]) + else: + target = "_".join(self.config.name.split("_")[1:3]) + url = _URLS[target] + path = dl_manager.download_and_extract(url) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": path, + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + is_mono = self.is_mono() + STR_TO_ISO = {"Indonesian": "ind", "Javanese": "jav", "Minangkabau": "min", "Sundanese": "sun", "Balinese": "ban"} + + with open(filepath, encoding="utf-8") as f: + flag = True + if "seacrowd_ssp" in self.config.schema or is_mono: + for counter, row in enumerate(f): + if row.strip != "": + yield ( + counter, + { + "id": str(counter), + "text": row.strip(), + }, + ) + elif self.config.schema == "source": + for counter, row in enumerate(f): + if flag: + src, tgt = row.split("\t") + tgt = tgt.strip() + flag = False + else: + if row.strip() != "": + yield ( + counter, + { + "id": str(counter), + "text_1": row.split("\t")[0].strip(), + "text_2": row.split("\t")[1].strip(), + "lang_1": STR_TO_ISO[src], + "lang_2": STR_TO_ISO[tgt], + }, + ) + elif self.config.schema == "seacrowd_t2t": + for counter, row in enumerate(f): + if flag: + src, tgt = row.split("\t") + tgt = tgt.strip() + flag = False + else: + if row.strip() != "": + yield ( + counter, + { + "id": str(counter), + "text_1": 
row.split("\t")[0].strip(), + "text_2": row.split("\t")[1].strip(), + "text_1_name": STR_TO_ISO[src], + "text_2_name": STR_TO_ISO[tgt], + }, + ) diff --git a/seacrowd/sea_datasets/indonglish/__init__.py b/seacrowd/sea_datasets/indonglish/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indonglish/indonglish.py b/seacrowd/sea_datasets/indonglish/indonglish.py new file mode 100644 index 000000000..a6707be07 --- /dev/null +++ b/seacrowd/sea_datasets/indonglish/indonglish.py @@ -0,0 +1,216 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{Astuti2023, +title = {Code-Mixed Sentiment Analysis using Transformer for Twitter Social Media Data}, +journal = {International Journal of Advanced Computer Science and Applications}, +doi = {10.14569/IJACSA.2023.0141053}, +url = {http://dx.doi.org/10.14569/IJACSA.2023.0141053}, +year = {2023}, +publisher = {The Science and Information Organization}, +volume = {14}, +number = {10}, +author = {Laksmita Widya Astuti and Yunita Sari and Suprapto} +} +""" + +_DATASETNAME = "indonglish" +_DESCRIPTION = """\ +Indonglish-dataset was constructed based on keywords derived from the +sociolinguistic phenomenon observed among teenagers in South Jakarta. The +dataset was designed to tackle the semantic task of sentiment analysis, +incorporating three distinct label categories: positive, negative, and +neutral. The annotation of the dataset was carried out by a panel of five +annotators, each possessing expertise language and data science. 
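+Besides the original split, three alternative train/validation/test splits (skenario1, skenario2, skenario3) are exposed as separate subsets.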
+""" + +_HOMEPAGE = "https://github.com/laksmitawidya/indonglish-dataset" +_LANGUAGES = ["ind"] +_LICENSE = Licenses.UNKNOWN.value +_LOCAL = False + +_URLS = { + "skenario-orig": { + "train": "https://raw.githubusercontent.com/laksmitawidya/indonglish-dataset/master/skenario-ori/train.csv", + "validation": "https://raw.githubusercontent.com/laksmitawidya/indonglish-dataset/master/skenario-ori/validation.csv", + "test": "https://raw.githubusercontent.com/laksmitawidya/indonglish-dataset/master/skenario-ori/test.csv", + }, + "skenario1": { + "train": "https://raw.githubusercontent.com/laksmitawidya/indonglish-dataset/master/skenario1/training.csv", + "validation": "https://raw.githubusercontent.com/laksmitawidya/indonglish-dataset/master/skenario1/validation.csv", + "test": "https://raw.githubusercontent.com/laksmitawidya/indonglish-dataset/master/skenario1/test.csv", + }, + "skenario2": { + "train": "https://raw.githubusercontent.com/laksmitawidya/indonglish-dataset/master/skenario2/training.csv", + "validation": "https://raw.githubusercontent.com/laksmitawidya/indonglish-dataset/master/skenario2/validation.csv", + "test": "https://raw.githubusercontent.com/laksmitawidya/indonglish-dataset/master/skenario2/test.csv", + }, + "skenario3": { + "train": "https://raw.githubusercontent.com/laksmitawidya/indonglish-dataset/master/skenario3/training.csv", + "validation": "https://raw.githubusercontent.com/laksmitawidya/indonglish-dataset/master/skenario3/validation.csv", + "test": "https://raw.githubusercontent.com/laksmitawidya/indonglish-dataset/master/skenario3/test.csv", + }, +} + +_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class Indonglish(datasets.GeneratorBasedBuilder): + """Indonglish dataset for sentiment analysis from https://github.com/laksmitawidya/indonglish-dataset.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "text" + _LABELS = ["Positif", "Negatif", "Netral"] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + for i in range(1, 4): + BUILDER_CONFIGS += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_skenario{i}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_skenario{i}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_skenario{i}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}_skenario{i}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "tweet": datasets.Value("string"), + "label": datasets.ClassLabel(names=self._LABELS), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text_features(self._LABELS) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + 
citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + if "skenario" in self.config.name: + setting = self.config.name.split("_")[1] + else: + setting = "skenario-orig" + + data_paths = { + setting: { + "train": Path(dl_manager.download_and_extract(_URLS[setting]["train"])), + "validation": Path(dl_manager.download_and_extract(_URLS[setting]["validation"])), + "test": Path(dl_manager.download_and_extract(_URLS[setting]["test"])), + } + } + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_paths[setting]["train"], + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_paths[setting]["test"], + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_paths[setting]["validation"], + "split": "dev", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + # read csv file + with open(filepath, "r", encoding="utf-8") as csv_file: + csv_reader = csv.reader(csv_file) + csv_data = [row for row in csv_reader] + csv_data = csv_data[1:] # remove header + + num_sample = len(csv_data) + + for i in range(num_sample): + if self.config.schema == "source": + example = { + "id": str(i), + "tweet": csv_data[i][0], + "label": csv_data[i][1], + } + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + example = { + "id": str(i), + "text": csv_data[i][0], + "label": csv_data[i][1], + } + + yield i, example diff --git a/seacrowd/sea_datasets/indoqa/__init__.py b/seacrowd/sea_datasets/indoqa/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indoqa/indoqa.py b/seacrowd/sea_datasets/indoqa/indoqa.py new file mode 100644 index 000000000..5411a4aee --- /dev/null +++ b/seacrowd/sea_datasets/indoqa/indoqa.py @@ -0,0 +1,152 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@misc{IndoQA, + author = {{Jakarta Artificial Intelligence Research}} + title = {IndoQA: Building Indonesian QA dataset}, + year = {2023} + url = {https://huggingface.co/datasets/jakartaresearch/indoqa} +} +""" + +_DATASETNAME = "indoqa" + +_DESCRIPTION = """\ +IndoQA is a monolingual question-answering dataset of Indonesian language (ind). +It comprises 4,413 examples with 3:1 split of training and validation sets. +The datasets consists of a context paragraph along with an associated question-answer pair. 
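+Each example also carries a question category and the answer span offsets (span_start, span_end) within the context.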
+""" + +_HOMEPAGE = "https://jakartaresearch.com/" +_LICENSE = Licenses.CC_BY_ND_4_0.value + +_LANGUAGES = ["ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) +_LOCAL = False +_URLS = { + _DATASETNAME: { + "train": "https://drive.google.com/uc?id=1ND893H5x2gaPRRMJVajQ4hgqpopHoD0u", + "validation": "https://drive.google.com/uc?id=1mq_foV72riXb1KVBirJzTFZEe7oa8f4f", + }, +} + +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class IndoQADataset(datasets.GeneratorBasedBuilder): + """IndoQA: A monolingual Indonesian question-answering dataset comprises 4,413 instances of QA-pair with context.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_qa", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_qa", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "question": datasets.Value("string"), + "answer": datasets.Value("string"), + "context": datasets.Value("string"), + "category": datasets.Value("string"), + "span_start": datasets.Value("int32"), + "span_end": datasets.Value("int32"), + } + ) + + elif self.config.schema == "seacrowd_qa": + features = schemas.qa_features + features["meta"]["span_start"] = datasets.Value("int32") + features["meta"]["span_end"] = datasets.Value("int32") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_paths = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": data_paths["train"]}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": data_paths["validation"]}, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + with open(filepath, "r", encoding="utf-8") as file: + datas = json.load(file) + + if self.config.schema == "source": + for key, data in enumerate(datas): + yield key, data + + elif self.config.schema == "seacrowd_qa": + for key, data in enumerate(datas): + yield key, { + "id": f'{data["id"]}', + "question_id": data["id"], + "document_id": "", + "question": data["question"], + "type": data["category"], + "choices": [], + "context": data["context"], + "answer": [data["answer"]], + "meta": { + "span_start": data["span_start"], + "span_end": data["span_end"], + }, + } diff --git a/seacrowd/sea_datasets/indosmd/__init__.py b/seacrowd/sea_datasets/indosmd/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indosmd/indosmd.py b/seacrowd/sea_datasets/indosmd/indosmd.py new file mode 100644 index 000000000..c78f542d3 --- /dev/null +++ b/seacrowd/sea_datasets/indosmd/indosmd.py @@ -0,0 +1,273 @@ +import json +from 
pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses + +_CITATION = """\ +@article{kautsar2023indotod, + author={Kautsar, Muhammad Dehan Al and Nurdini, Rahmah Khoirussyifa' and Cahyawijaya, Samuel and Winata, Genta Indra and Purwarianti, Ayu}, + title={IndoToD: A Multi-Domain Indonesian Benchmark For End-to-End Task-Oriented Dialogue Systems}, + journal={arXiv preprint arXiv:2311.00958}, + year={2023}, +} +""" + +_LANGUAGES = ["ind"] +_LOCAL = False + +_DATASETNAME = "indosmd" + +_DESCRIPTION = """\ +IndoSMD is a synthetic task-oriented dialogue system dataset that was translated from the In-Car Assistant (SMD) dataset (Eric et al., 2017) into the new Indonesian dataset using the translation pipeline method +including delexicalization, translation, and delexicalization. The dataset consists of 323 dialogues in the POI Navigation, Calendar Scheduling, and Weather Information Retrieval domain, with a user and an agent talking to each other. +It also consists of slots and dialogue acts from the user and the agent. +""" + +_HOMEPAGE = "https://github.com/dehanalkautsar/IndoToD/tree/main/IndoSMD" + +_LICENSE = Licenses.CC_BY_SA_4_0.value + +_URLS = { + _DATASETNAME: { + "train": "https://raw.githubusercontent.com/dehanalkautsar/IndoToD/main/IndoSMD/IndoSMD_split/IndoSMD_train.json", + "validation": "https://raw.githubusercontent.com/dehanalkautsar/IndoToD/main/IndoSMD/IndoSMD_split/IndoSMD_dev.json", + "test": "https://raw.githubusercontent.com/dehanalkautsar/IndoToD/main/IndoSMD/IndoSMD_split/IndoSMD_test.json", + }, +} + +_SUPPORTED_TASKS = [Tasks.E2E_TASK_ORIENTED_DIALOGUE] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class IndoSMDDataset(datasets.GeneratorBasedBuilder): + """IndoToD: A Multi-Domain Indonesian Benchmark For End-to-End Task-Oriented Dialogue Systems""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description="IndoToD: IndoSMD source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_tod", + version=SEACROWD_VERSION, + description="IndoToD: IndoSMD SEACrowd End-to-end Task Oriented Dialogue schema", + schema="seacrowd_tod", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = "indosmd_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "index": datasets.Value("string"), + "dialogue": [ + { + "turn": datasets.Value("string"), + "data": { + "end_dialogue": datasets.Value("string"), + "utterance": datasets.Value("string"), + "delex_utterance": datasets.Value("string"), + "requested": { + "distance": datasets.Value("string"), + "traffic_info": datasets.Value("string"), + "poi_type": datasets.Value("string"), + "address": datasets.Value("string"), + "poi": datasets.Value("string"), + "room": datasets.Value("string"), + "agenda": datasets.Value("string"), + "time": datasets.Value("string"), + "date": datasets.Value("string"), + "party": datasets.Value("string"), + "event": datasets.Value("string"), + "weather_attribute": datasets.Value("string"), + "location": datasets.Value("string"), + }, + "slots": { + "distance": datasets.Value("string"), + "traffic_info": 
datasets.Value("string"), + "poi_type": datasets.Value("string"), + "address": datasets.Value("string"), + "poi": datasets.Value("string"), + "room": datasets.Value("string"), + "agenda": datasets.Value("string"), + "time": datasets.Value("string"), + "date": datasets.Value("string"), + "party": datasets.Value("string"), + "event": datasets.Value("string"), + "weather_attribute": datasets.Value("string"), + "location": datasets.Value("string"), + }, + }, + } + ], + "scenario": { + "kb": { + "items": [ + { + "distance": datasets.Value("string"), + "traffic_info": datasets.Value("string"), + "poi_type": datasets.Value("string"), + "address": datasets.Value("string"), + "poi": datasets.Value("string"), + "room": datasets.Value("string"), + "agenda": datasets.Value("string"), + "time": datasets.Value("string"), + "date": datasets.Value("string"), + "party": datasets.Value("string"), + "event": datasets.Value("string"), + "monday": datasets.Value("string"), + "tuesday": datasets.Value("string"), + "wednesday": datasets.Value("string"), + "thursday": datasets.Value("string"), + "friday": datasets.Value("string"), + "saturday": datasets.Value("string"), + "sunday": datasets.Value("string"), + "today": datasets.Value("string"), + "location": datasets.Value("string"), + } + ], + "column_names": [datasets.Value("string")], + "kb_title": datasets.Value("string"), + }, + "task": {"intent": datasets.Value("string")}, + "uuid": datasets.Value("string"), + }, + } + ) + elif self.config.schema == "seacrowd_tod": + features = schemas.tod_features + else: + raise NotImplementedError(f"Schema {self.config.schema} has not been implemented") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir["train"], + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_dir["validation"], + "split": "validation", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_dir["test"], + "split": "test", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + key_slot_constant = ["distance", "traffic_info", "poi_type", "address", "poi", "room", "agenda", "time", "date", "party", "event", "weather_attribute", "location"] + key_kb_constant = ["distance", "traffic_info", "poi_type", "address", "poi", "room", "agenda", "time", "date", "party", "event", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "today", "location"] + + with open(filepath, "r+") as fw: + data = json.loads(fw.read()) + + if self.config.schema == "source": + for idx, example in enumerate(data): + example["index"] = str(idx) + for i in range(len(example["dialogue"])): + if "requested" not in example["dialogue"][i]["data"]: # the difference between user and system utterance (user and system utterance is divided into each dict in the origin dataset) + example["dialogue"][i]["data"]["requested"] = {} + example["dialogue"][i]["data"]["slots"] = {} + for key in key_slot_constant: + 
example["dialogue"][i]["data"]["requested"][key] = "" + example["dialogue"][i]["data"]["slots"][key] = "" + else: + for key in key_slot_constant: + if key not in example["dialogue"][i]["data"]["requested"]: + example["dialogue"][i]["data"]["requested"][key] = "" + if key not in example["dialogue"][i]["data"]["slots"]: + example["dialogue"][i]["data"]["slots"][key] = "" + + if not example["scenario"]["kb"].get("items"): + example["scenario"]["kb"]["items"] = [] + + for i in range(len(example["scenario"]["kb"]["items"])): + for key in key_kb_constant: + if key not in example["scenario"]["kb"]["items"][i]: + example["scenario"]["kb"]["items"][i][key] = "" + + yield str(idx), example + + elif self.config.schema == "seacrowd_tod": + for idx, tod_dialogue in enumerate(data): + example = {} + example["dialogue_idx"] = idx + + dialogue = [] + # NOTE: the dialogue always started with `driver` as first utterance + for turn, i in enumerate(range(0, len(tod_dialogue["dialogue"]) + 2, 2)): + dial = {} + dial["turn_idx"] = turn + + # system_utterance properties + dial["system_utterance"] = "" + dial["system_acts"] = [] + if turn != 0: + dial["system_utterance"] = tod_dialogue["dialogue"][i - 1]["data"]["utterance"] + if i < len(tod_dialogue["dialogue"]): + # NOTE: system_acts will be filled with every slot that has 'True' value on the origin dataset (on the requested field) + for act in tod_dialogue["dialogue"][i + 1]["data"]["requested"]: + if tod_dialogue["dialogue"][i + 1]["data"]["requested"][act]: + dial["system_acts"].append([act]) + + # user_utterance properties + dial["turn_label"] = [] + dial["belief_state"] = [] + if i == len(tod_dialogue["dialogue"]): + # case if turn_idx > len(dialogue) --> add dummy user_utterance + dial["user_utterance"] = "" + else: + dial["user_utterance"] = tod_dialogue["dialogue"][i]["data"]["utterance"] + # NOTE: belief_state will be filled with request act from `requested` field & inform act from `slots` field in the origin dataset + for act in tod_dialogue["dialogue"][i + 1]["data"]["requested"]: + if tod_dialogue["dialogue"][i + 1]["data"]["requested"][act]: + dial["belief_state"].append({"slots": [["slot", act]], "act": "request"}) + for slot, slot_value in tod_dialogue["dialogue"][i + 1]["data"]["slots"].items(): + dial["belief_state"].append({"slots": [[slot, slot_value]], "act": "inform"}) + + # append to dialogue + dialogue.append(dial) + example["dialogue"] = dialogue + yield str(idx), example diff --git a/seacrowd/sea_datasets/indowiki/__init__.py b/seacrowd/sea_datasets/indowiki/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indowiki/indowiki.py b/seacrowd/sea_datasets/indowiki/indowiki.py new file mode 100644 index 000000000..22403f0ed --- /dev/null +++ b/seacrowd/sea_datasets/indowiki/indowiki.py @@ -0,0 +1,198 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses + +_CITATION = """\ +@INPROCEEDINGS{ramli2022indokepler, + author={Ramli, Inigo and Krisnadhi, Adila Alfa and Prasojo, Radityo Eko}, + booktitle={2022 7th International Workshop on Big Data and Information Security (IWBIS)}, + title={IndoKEPLER, IndoWiki, and IndoLAMA: A Knowledge-enhanced Language Model, Dataset, and Benchmark for the Indonesian Language}, + year={2022}, + volume={}, + number={}, + pages={19-26}, + doi={10.1109/IWBIS56557.2022.9924844}} +""" + +_DATASETNAME = "indowiki" +_DESCRIPTION = """\ +IndoWiki is a knowledge-graph dataset taken from WikiData and aligned with Wikipedia Bahasa Indonesia as it's corpus. +""" +_HOMEPAGE = "https://github.com/IgoRamli/IndoWiki" +_LANGUAGES = ["ind"] +_LICENSE = Licenses.MIT.value +_LOCAL = False + +_URLS = { + "inductive": { + "train": "https://drive.google.com/uc?export=download&id=1S3vNx9By5CWKGkObjtXaI6Jr4xri2Tz3", + "valid": "https://drive.google.com/uc?export=download&id=1cP-zDIxp9a-Bw9uYd40K9IN-4wg4dOgy", + "test": "https://drive.google.com/uc?export=download&id=1pLcoJgYmgQiN4Gv9tRcI26zM7-OgHcuZ", + }, + "transductive": { + "train": "https://drive.google.com/uc?export=download&id=1KXDVwboo1h2yk_kAqv7IPYnHXCK6g-6X", + "valid": "https://drive.google.com/uc?export=download&id=1eRwpuRPYOnA-7FZ-YNZjRJ2DHuJsfUIE", + "test": "https://drive.google.com/uc?export=download&id=1cy9FwDMB_U-js8P8u4IWolvNeIFkQVDh", + }, + "text": "https://drive.usercontent.google.com/download?id=1YC4P_IPSo1AsEwm5Z_4GBjDdwCbvokxX&export=download&authuser=0&confirm=t&uuid=36aa95f5-e1b6-43c1-a34f-754d14d8b473&at=APZUnTWD7fwarBs4ZVRy_QdKbDXi%3A1709478240158", +} + +# none of the tasks in schema +# dataset is used to learn knowledge embedding +_SUPPORTED_TASKS = [] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class IndoWiki(datasets.GeneratorBasedBuilder): + """IndoWiki knowledge base dataset from https://github.com/IgoRamli/IndoWiki""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + # inductive setting + SEACrowdConfig( + name=f"{_DATASETNAME}_inductive_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + # transductive setting + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "ent1": datasets.Value("string"), + "ent2": datasets.Value("string"), + "ent1_text": datasets.Value("string"), + "ent2_text": datasets.Value("string"), + "relation": datasets.Value("string"), + } + ) + + else: + raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + if "inductive" in self.config.name: + setting = "inductive" + data_paths = { + "inductive": { + "train": Path(dl_manager.download_and_extract(_URLS["inductive"]["train"])), + 
"valid": Path(dl_manager.download_and_extract(_URLS["inductive"]["valid"])), + "test": Path(dl_manager.download_and_extract(_URLS["inductive"]["test"])), + }, + "text": Path(dl_manager.download_and_extract(_URLS["text"])), + } + else: + setting = "transductive" + data_paths = { + "transductive": { + "train": Path(dl_manager.download_and_extract(_URLS["transductive"]["train"])), + "valid": Path(dl_manager.download_and_extract(_URLS["transductive"]["valid"])), + "test": Path(dl_manager.download_and_extract(_URLS["transductive"]["test"])), + }, + "text": Path(dl_manager.download_and_extract(_URLS["text"])), + } + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "triplets_filepath": data_paths[setting]["train"], + "text_filepath": data_paths["text"], + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "triplets_filepath": data_paths[setting]["test"], + "text_filepath": data_paths["text"], + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "triplets_filepath": data_paths[setting]["valid"], + "text_filepath": data_paths["text"], + "split": "dev", + }, + ), + ] + + def _generate_examples(self, triplets_filepath: Path, text_filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + # read triplets file + with open(triplets_filepath, "r", encoding="utf-8") as triplets_file: + triplets_data = triplets_file.readlines() + triplets_data = [s.strip("\n").split("\t") for s in triplets_data] + + # read text description file + with open(text_filepath, "r", encoding="utf-8") as text_file: + text_data = text_file.readlines() + # dictionary of entity: text description of entity + text_dict = {s.split("\t")[0]: s.split("\t")[1].strip("\n") for s in text_data} + + num_sample = len(triplets_data) + + for i in range(num_sample): + if self.config.schema == "source": + example = { + "id": str(i), + "ent1": triplets_data[i][0], + "ent2": triplets_data[i][2], + "ent1_text": text_dict[triplets_data[i][0]], + "ent2_text": text_dict[triplets_data[i][2]], + "relation": triplets_data[i][1], + } + + yield i, example diff --git a/seacrowd/sea_datasets/kawat/__init__.py b/seacrowd/sea_datasets/kawat/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/kde4/__init__.py b/seacrowd/sea_datasets/kde4/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/kde4/kde4.py b/seacrowd/sea_datasets/kde4/kde4.py new file mode 100644 index 000000000..10957fae1 --- /dev/null +++ b/seacrowd/sea_datasets/kde4/kde4.py @@ -0,0 +1,574 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +A parallel corpus of KDE4 localization files. The corpus is available in 92 languages in total, with 4099 bitexts. 
+""" +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{tiedemann2012parallel, + title={Parallel Data, Tools and Interfaces in OPUS}, + author={Tiedemann, J{\"o}rg}, + booktitle={Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12)}, + pages={2214--2218}, + year={2012} +} +""" + +_DATASETNAME = "kde4" + +_DESCRIPTION = """\ +A parallel corpus of KDE4 localization files. The corpus is available in 92 languages in total, with 4099 bitexts. +""" + +_HOMEPAGE = "https://opus.nlpl.eu/KDE4/corpus/version/KDE4" + +_LANGUAGES = ["ind", "khm", "zlm", "tha", "vie"] + +_LICENSE = Licenses.UNKNOWN.value +_LOCAL = False + +_URL_TEMPLATE = "https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/{src}-{tgt}.txt.zip" + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] +_SOURCE_VERSION = "2.0.0" +_SEACROWD_VERSION = "1.0.0" + +kde4_language_codes = { + "aar": "aa", + "abk": "ab", + "ave": "ae", + "afr": "af", + "aka": "ak", + "amh": "am", + "arg": "an", + "ara": "ar", + "asm": "as", + "ava": "av", + "aym": "ay", + "aze": "az", + "bak": "ba", + "bel": "be", + "bul": "bg", + "bis": "bi", + "bam": "bm", + "ben": "bn", + "bod": "bo", + "bre": "br", + "bos": "bs", + "cat": "ca", + "che": "ce", + "cha": "ch", + "cos": "co", + "cre": "cr", + "ces": "cs", + "chu": "cu", + "chv": "cv", + "cym": "cy", + "dan": "da", + "deu": "de", + "div": "dv", + "dzo": "dz", + "ewe": "ee", + "ell": "el", + "eng": "en", + "epo": "eo", + "spa": "es", + "est": "et", + "eus": "eu", + "fas": "fa", + "ful": "ff", + "fin": "fi", + "fij": "fj", + "fao": "fo", + "fra": "fr", + "fry": "fy", + "gle": "ga", + "gla": "gd", + "glg": "gl", + "grn": "gn", + "guj": "gu", + "glv": "gv", + "hau": "ha", + "heb": "he", + "hin": "hi", + "hmo": "ho", + "hrv": "hr", + "hat": "ht", + "hun": "hu", + "hye": "hy", + "her": "hz", + "ina": "ia", + "ind": "id", + "ile": "ie", + "ibo": "ig", + "iii": "ii", + "ipk": "ik", + "ido": "io", + "isl": "is", + "ita": "it", + "iku": "iu", + "jpn": "ja", + "jav": "jv", + "kat": "ka", + "kon": "kg", + "kik": "ki", + "kua": "kj", + "kaz": "kk", + "kal": "kl", + "khm": "km", + "kan": "kn", + "kor": "ko", + "kau": "kr", + "kas": "ks", + "kur": "ku", + "kom": "kv", + "cor": "kw", + "kir": "ky", + "lat": "la", + "ltz": "lb", + "lug": "lg", + "lim": "li", + "lin": "ln", + "lao": "lo", + "lit": "lt", + "lub": "lu", + "lav": "lv", + "mlg": "mg", + "mah": "mh", + "mri": "mi", + "mkd": "mk", + "mal": "ml", + "mon": "mn", + "mar": "mr", + "msa": "ms", + "mlt": "mt", + "mya": "my", + "nau": "na", + "nob": "nb", + "nde": "nd", + "nep": "ne", + "ndo": "ng", + "nld": "nl", + "nno": "nn", + "nor": "no", + "nbl": "nr", + "nav": "nv", + "nya": "ny", + "oci": "oc", + "oji": "oj", + "orm": "om", + "ori": "or", + "oss": "os", + "pan": "pa", + "pli": "pi", + "pol": "pl", + "pus": "ps", + "por": "pt", + "que": "qu", + "roh": "rm", + "run": "rn", + "ron": "ro", + "rus": "ru", + "kin": "rw", + "san": "sa", + "srd": "sc", + "snd": "sd", + "sme": "se", + "sag": "sg", + "hbs": "sh", + "sin": "si", + "slk": "sk", + "slv": "sl", + "smo": "sm", + "sna": "sn", + "som": "so", + "sqi": "sq", + "srp": "sr", + "ssw": "ss", + "sot": "st", + "sun": "su", + "swe": "sv", + "swa": "sw", + "tam": "ta", + "tel": "te", + "tgk": "tg", + "tha": "th", + "tir": "ti", + "tuk": "tk", + "tgl": "tl", + "tsn": "tn", + 
"ton": "to", + "tur": "tr", + "tso": "ts", + "tat": "tt", + "twi": "tw", + "tah": "ty", + "uig": "ug", + "ukr": "uk", + "urd": "ur", + "uzb": "uz", + "ven": "ve", + "vie": "vi", + "vol": "vo", + "wln": "wa", + "wol": "wo", + "xho": "xh", + "yid": "yi", + "yor": "yo", + "zha": "za", + "zho": "zh", + "zul": "zu", + "nds": "nds", + "mai": "mai", + "nso": "nso", + "ast": "ast", + "crh": "crh", + "csb": "csb", + "hne": "hne", + "hsb": "hsb", +} + +configs = { + "afr": ["msa", "tha", "khm", "ind", "vie"], + "ara": ["msa", "tha", "khm", "ind", "vie"], + "asm": ["msa", "tha", "khm", "ind", "vie"], + "ast": ["msa", "tha", "khm", "ind", "vie"], + "bel": ["msa", "tha", "khm", "ind", "vie"], + "bul": ["msa", "tha", "khm", "ind", "vie"], + "ben": ["msa", "tha", "khm", "ind", "vie"], + "bre": ["msa", "tha", "khm", "ind", "vie"], + "cat": ["msa", "tha", "khm", "ind", "vie"], + "crh": ["msa", "tha", "khm", "ind", "vie"], + "ces": ["msa", "tha", "khm", "ind", "vie"], + "csb": ["msa", "tha", "khm", "ind", "vie"], + "cym": ["msa", "tha", "khm", "ind", "vie"], + "dan": ["msa", "tha", "khm", "ind", "vie"], + "deu": ["msa", "tha", "khm", "ind", "vie"], + "ell": ["msa", "tha", "khm", "ind", "vie"], + "eng": ["msa", "tha", "khm", "ind", "vie"], + "epo": ["msa", "tha", "khm", "ind", "vie"], + "spa": ["msa", "tha", "khm", "ind", "vie"], + "est": ["msa", "tha", "khm", "ind", "vie"], + "eus": ["msa", "tha", "khm", "ind", "vie"], + "fas": ["msa", "tha", "khm", "ind", "vie"], + "fin": ["msa", "tha", "khm", "ind", "vie"], + "fra": ["msa", "tha", "khm", "ind", "vie"], + "fry": ["msa", "tha", "khm", "ind", "vie"], + "gle": ["msa", "tha", "khm", "ind", "vie"], + "glg": ["msa", "tha", "khm", "ind", "vie"], + "guj": ["msa", "tha", "khm", "ind", "vie"], + "hau": ["msa", "tha", "khm", "ind", "vie"], + "heb": ["msa", "tha", "khm", "ind", "vie"], + "hin": ["msa", "tha", "khm", "ind", "vie"], + "hne": ["msa", "tha", "khm", "ind", "vie"], + "hrv": ["msa", "tha", "khm", "ind", "vie"], + "hsb": ["msa", "tha", "khm", "ind", "vie"], + "hun": ["msa", "tha", "khm", "ind", "vie"], + "hye": ["msa", "tha", "khm", "ind", "vie"], + "ind": [ + "kan", + "pus", + "msa", + "slv", + "tur", + "rus", + "nld", + "mkd", + "jpn", + "ori", + "nep", + "xho", + "nds", + "lav", + "ukr", + "vie", + "mai", + "tam", + "ltz", + "isl", + "uzb", + "sme", + "lit", + "tgk", + "kat", + "mal", + "srp", + "wln", + "por", + "oci", + "kur", + "mar", + "sin", + "slk", + "kor", + "kaz", + "ron", + "nno", + "tha", + "khm", + "tel", + "ita", + "pol", + "swe", + "pan", + "nob", + ], + "isl": ["khm", "msa", "tha", "vie"], + "ita": ["khm", "msa", "tha", "vie"], + "jpn": ["khm", "msa", "tha", "vie"], + "kat": ["khm", "msa", "tha", "vie"], + "kaz": ["khm", "msa", "tha", "vie"], + "khm": [ + "kan", + "pus", + "msa", + "slv", + "tur", + "rus", + "kin", + "nld", + "mkd", + "ori", + "xho", + "nso", + "nep", + "nds", + "lav", + "ukr", + "vie", + "mai", + "tam", + "ltz", + "uzb", + "sme", + "lit", + "tgk", + "mlt", + "mal", + "srp", + "wln", + "por", + "oci", + "kur", + "mar", + "sin", + "slk", + "kor", + "ron", + "nno", + "tha", + "tel", + "pol", + "swe", + "pan", + "nob", + ], + "kan": ["msa", "tha", "vie"], + "kor": ["msa", "tha", "vie"], + "kur": ["msa", "tha", "vie"], + "ltz": ["msa", "tha", "vie"], + "lit": ["msa", "tha", "vie"], + "lav": ["msa", "tha", "vie"], + "mai": ["msa", "tha", "vie"], + "mkd": ["msa", "tha", "vie"], + "mal": ["msa", "tha", "vie"], + "mar": ["msa", "tha", "vie"], + "msa": [ + "pus", + "slv", + "tur", + "rus", + "kin", + "nld", + "ori", + "xho", + "nso", 
+ "nep", + "nds", + "ukr", + "vie", + "tam", + "uzb", + "sme", + "tgk", + "mlt", + "srp", + "wln", + "por", + "oci", + "slk", + "sin", + "ron", + "nno", + "tha", + "tel", + "pol", + "swe", + "pan", + "nob", + ], + "mlt": ["tha"], + "nob": ["tha", "vie"], + "nds": ["tha", "vie"], + "nep": ["tha", "vie"], + "nld": ["tha", "vie"], + "nno": ["tha", "vie"], + "oci": ["tha", "vie"], + "ori": ["tha", "vie"], + "pan": ["tha", "vie"], + "pol": ["tha", "vie"], + "pus": ["tha", "vie"], + "por": ["tha", "vie"], + "ron": ["tha", "vie"], + "rus": ["tha", "vie"], + "kin": ["tha"], + "sme": ["tha", "vie"], + "sin": ["tha", "vie"], + "slk": ["tha", "vie"], + "slv": ["tha", "vie"], + "srp": ["tha", "vie"], + "swe": ["tha", "vie"], + "tam": ["tha", "vie"], + "tel": ["tha", "vie"], + "tgk": ["tha", "vie"], + "tha": ["wln", "tur", "uzb", "xho", "vie", "ukr"], + "tur": ["vie"], + "ukr": ["vie"], + "uzb": ["vie"], + "vie": ["wln", "xho"], +} + + +class KDE4Dataset(datasets.GeneratorBasedBuilder): + """A parallel corpus of KDE4 localization files. The corpus is available in 92 languages in total, with 4099 bitexts.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = ( + [ + SEACrowdConfig( + name="kde4_source", + version=datasets.Version(_SOURCE_VERSION), + description="kde4 source schema for afr to msa", + schema="source", + subset_id="afr_msa", + ), + SEACrowdConfig( + name="kde4_seacrowd_t2t", + version=datasets.Version(_SOURCE_VERSION), + description="kde4 seacrowd_t2t schema for afr to msa", + schema="seacrowd_t2t", + subset_id="afr_msa", + ), + ] + + [ + SEACrowdConfig( + name=f"kde4_{src_lang}_{tgt_lang}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"kde4 source schema for {src_lang} to {tgt_lang}", + schema="source", + subset_id=f"{src_lang}_{tgt_lang}", + ) + for src_lang in configs + for tgt_lang in configs[src_lang] + ] + + [ + SEACrowdConfig( + name=f"kde4_{src_lang}_{tgt_lang}_seacrowd_t2t", + version=datasets.Version(_SEACROWD_VERSION), + description=f"kde4 seacrowd_t2t schema for {src_lang} to {tgt_lang}", + schema="seacrowd_t2t", + subset_id=f"{src_lang}_{tgt_lang}", + ) + for src_lang in configs + for tgt_lang in configs[src_lang] + ] + ) + + DEFAULT_CONFIG_NAME = "kde4_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "src_text": datasets.Value("string"), + "tgt_text": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + src_lang, tgt_lang = self.config.subset_id.split("_") + kde4_src_lang, kde4_tgt_lang = kde4_language_codes[src_lang], kde4_language_codes[tgt_lang] + + url = _URL_TEMPLATE.format(src=kde4_src_lang, tgt=kde4_tgt_lang) + data_dir = dl_manager.download_and_extract(url) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "src_lang": src_lang, + "tgt_lang": tgt_lang, + "src_filepath": os.path.join(data_dir, f"KDE4.{kde4_src_lang}-{kde4_tgt_lang}.{kde4_src_lang}"), + "tgt_filepath": os.path.join(data_dir, f"KDE4.{kde4_src_lang}-{kde4_tgt_lang}.{kde4_tgt_lang}"), + }, + ) + ] + + def 
_generate_examples(self, src_lang: str, tgt_lang: str, src_filepath: Path, tgt_filepath: Path) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + if self.config.schema == "source": + for row_id, (src_text, tgt_text) in enumerate(zip(open(src_filepath), open(tgt_filepath))): + yield row_id, { + "id": row_id, + "src_text": src_text.strip(), + "tgt_text": tgt_text.strip(), + } + + elif self.config.schema == "seacrowd_t2t": + for row_id, (src_text, tgt_text) in enumerate(zip(open(src_filepath), open(tgt_filepath))): + yield row_id, { + "id": row_id, + "text_1": src_text.strip(), + "text_2": tgt_text.strip(), + "text_1_name": src_lang, + "text_2_name": tgt_lang, + } diff --git a/seacrowd/sea_datasets/kheng_info/__init__.py b/seacrowd/sea_datasets/kheng_info/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/kheng_info/kheng_info.py b/seacrowd/sea_datasets/kheng_info/kheng_info.py new file mode 100644 index 000000000..37e2d159b --- /dev/null +++ b/seacrowd/sea_datasets/kheng_info/kheng_info.py @@ -0,0 +1,113 @@ +# coding=utf-8 + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +# no bibtex citation +_CITATION = "" + +_DATASETNAME = "kheng_info" + +_DESCRIPTION = """\ +The Kheng.info Speech dataset was derived from recordings of Khmer words on the Khmer dictionary website kheng.info. +The recordings were recorded by a native Khmer speaker. +The recordings are short, generally ranging between 1 to 2 seconds only. +""" + +_HOMEPAGE = "https://huggingface.co/datasets/seanghay/khmer_kheng_info_speech" + +_LANGUAGES = ["khm"] + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://huggingface.co/datasets/seanghay/khmer_kheng_info_speech/resolve/main/data/train-00000-of-00001-4e7ad082a34164d1.parquet", +} + +_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class KhengInfoDataset(datasets.GeneratorBasedBuilder): + """This is the Kheng.info Speech dataset, which wasderived from recordings on the Khmer dictionary website kheng.info""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_sptext", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_sptext", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features({"word": datasets.Value("string"), "duration_ms": datasets.Value("int64"), "audio": datasets.Audio(sampling_rate=16_000)}) + + elif self.config.schema == "seacrowd_sptext": + features = schemas.speech_text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + 
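+        # the source is a single parquet file, so everything is exposed as one "train" split below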
+ return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir, + }, + ) + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + df = pd.read_parquet(filepath, engine="pyarrow") + if self.config.schema == "source": + for _id, row in df.iterrows(): + yield _id, {"word": row["word"], "duration_ms": row["duration_ms"], "audio": row["audio"]} + elif self.config.schema == "seacrowd_sptext": + for _id, row in df.iterrows(): + yield _id, { + "id": _id, + "path": row["audio"], + "audio": row["audio"], + "text": row["word"], + "speaker_id": None, + "metadata": { + "speaker_age": None, + "speaker_gender": None, + }, + } diff --git a/seacrowd/sea_datasets/khpos/__init__.py b/seacrowd/sea_datasets/khpos/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/khpos/khpos.py b/seacrowd/sea_datasets/khpos/khpos.py new file mode 100644 index 000000000..550bc0e80 --- /dev/null +++ b/seacrowd/sea_datasets/khpos/khpos.py @@ -0,0 +1,212 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +The khPOS Corpus (Khmer POS Corpus) is a 12,000 sentences (25,626 words) manually word segmented and POS tagged corpus +developed for Khmer language NLP research and developments. We collected Khmer sentences from websites that include +various area such as economics, news, politics. Moreover it is also contained some student list and voter list of +national election committee of Cambodia. The average number of words per sentence in the whole corpus is 10.75. +Here, some symbols such as "។" (Khmer sign Khan), "៖" (Khmer sign Camnuc pii kuuh), "-", "?", "[", "]" etc. also +counted as words. The shortest sentence contained only 1 word and longest sentence contained 169 words. This dataset contains +A validation set and a test set, each containing 1000 sentences. +""" +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses + +_CITATION = """\ +@inproceedings{kyaw2017comparison, + title={Comparison of Six POS Tagging Methods on 12K Sentences Khmer Language POS Tagged Corpus}, + author={Ye Kyaw Thu and Vichet Chea and Yoshinori Sagisaka}, + booktitle={Proceedings of the first Regional Conference on Optical character recognition and Natural language processing technologies for ASEAN languages (ONA 2017)}, + year={2017}, + month={December 7-8}, + address={Phnom Penh, Cambodia} +} +""" + +_DATASETNAME = "khpos" + +_DESCRIPTION = """\ +The khPOS Corpus (Khmer POS Corpus) is a 12,000 sentences (25,626 words) manually word segmented and POS tagged corpus +developed for Khmer language NLP research and developments. We collected Khmer sentences from websites that include +various area such as economics, news, politics. 
Moreover it is also contained some student list and voter list of +national election committee of Cambodia. The average number of words per sentence in the whole corpus is 10.75. +Here, some symbols such as "។" (Khmer sign Khan), "៖" (Khmer sign Camnuc pii kuuh), "-", "?", "[", "]" etc. also +counted as words. The shortest sentence contained only 1 word and longest sentence contained 169 words. This dataset contains +A validation set and a test set, each containing 1000 sentences. +""" + +_HOMEPAGE = "https://github.com/ye-kyaw-thu/khPOS/tree/master" + +_LANGUAGES = ['khm'] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.CC_BY_NC_SA_4_0.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: { + 'train': "https://raw.githubusercontent.com/ye-kyaw-thu/khPOS/master/corpus-draft-ver-1.0/data/after-replace/train.all2", + 'validation': "https://raw.githubusercontent.com/ye-kyaw-thu/khPOS/master/corpus-draft-ver-1.0/data/OPEN-TEST", + 'test': "https://raw.githubusercontent.com/ye-kyaw-thu/khPOS/master/corpus-draft-ver-1.0/data/CLOSE-TEST" + } +} + +_SUPPORTED_TASKS = [Tasks.POS_TAGGING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class KhPOS(datasets.GeneratorBasedBuilder): + """\ +This datasets contain 12000 sentences (25626 words) for the Khmer language. +There are 24 POS tags and their description can be found at https://github.com/ye-kyaw-thu/khPOS/tree/master. +The used Khmer Tokenizer can be found in the above github repository as well. This dataset contains +A validation set and a test set, each containing 1000 sentences. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="khpos_source", + version=SOURCE_VERSION, + description="khpos source schema", + schema="source", + subset_id="khpos", + ), + SEACrowdConfig( + name="khpos_seacrowd_seq_label", + version=SEACROWD_VERSION, + description="khpos SEACrowd schema", + schema="seacrowd_seq_label", + subset_id="khpos", + ), + ] + + DEFAULT_CONFIG_NAME = "khpos_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features({ + "id" : datasets.Value("string"), + "tokens" : datasets.Sequence(datasets.Value("string")), + #pos_tags follows order from corpus-draft-ver-1.0/data/after-replace/train.all2.tag.freq + "pos_tags": datasets.Sequence(datasets.features.ClassLabel( + names = [ + 'AB', 'AUX', 'CC', 'CD', + 'DBL', 'DT', 'ETC', 'IN', + 'JJ', 'KAN', 'M', 'NN', + 'PA', 'PN', 'PRO', 'QT', + 'RB', 'RPN', 'SYM', 'UH', + 'VB', 'VB_JJ', 'VCOM' + ] + )) + }) + elif self.config.schema == "seacrowd_seq_label": + features = schemas.seq_label.features([ + 'AB', 'AUX', 'CC', 'CD', + 'DBL', 'DT', 'ETC', 'IN', + 'JJ', 'KAN', 'M', 'NN', + 'PA', 'PN', 'PRO', 'QT', + 'RB', 'RPN', 'SYM', 'UH', + 'VB', 'VB_JJ', 'VCOM' + ]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME]['train'] + path = dl_manager.download_and_extract(urls) + + dev_url = _URLS[_DATASETNAME]['validation'] + dev_path = dl_manager.download_and_extract(dev_url) + + test_url = _URLS[_DATASETNAME]['test'] + test_path = dl_manager.download_and_extract(test_url) + + return [ + 
datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": path, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": dev_path, + "split": "dev", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": test_path, + "split": "test", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + with open(filepath, encoding="utf-8") as file: + counter = 0 + for line in file: + if line.strip() != "": + groups = line.split(" ") + tokens = [] + pos_tags = [] + for group in groups: + token, pos_tag = group.split("/") + tokens.append(token) + pos_tags.append(pos_tag) + if self.config.schema == "source": + yield ( + counter, + { + "id" : str(counter), + "tokens" : tokens, + "pos_tags": pos_tags + } + ) + counter += 1 + elif self.config.schema == "seacrowd_seq_label": + yield ( + counter, + { + "id" : str(counter), + "tokens": tokens, + "labels": pos_tags + } + ) + counter += 1 diff --git a/seacrowd/sea_datasets/lazada_review_filipino/__init__.py b/seacrowd/sea_datasets/lazada_review_filipino/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/lazada_review_filipino/lazada_review_filipino.py b/seacrowd/sea_datasets/lazada_review_filipino/lazada_review_filipino.py new file mode 100644 index 000000000..29266c836 --- /dev/null +++ b/seacrowd/sea_datasets/lazada_review_filipino/lazada_review_filipino.py @@ -0,0 +1,147 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Filipino-Tagalog Product Reviews Sentiment Analysis +This is a machine learning dataset that can be used to analyze the sentiment of product reviews in Filipino-Tagalog. +The data is scraped from lazada Philippines. +""" +import os +from pathlib import Path +from typing import Dict, List, Tuple +import json + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses + + +_CITATION = """@misc{github, + author={Eric Echemane}, + title={Filipino-Tagalog-Product-Reviews-Sentiment-Analysis}, + year={2022}, + url={https://github.com/EricEchemane/Filipino-Tagalog-Product-Reviews-Sentiment-Analysis/tree/main}, +} +""" + +_DATASETNAME = "lazada_review_filipino" + + +_DESCRIPTION = """Filipino-Tagalog Product Reviews Sentiment Analysis +This is a machine learning dataset that can be used to analyze the sentiment of product reviews in Filipino-Tagalog. +The dataset contains over 900+ weakly annotated Filipino reviews scraped from the Lazada Philippines platform. +Each review is associated with a five star point rating where one is the lowest and five is the highest. 
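+In the seacrowd_text schema, the five star ratings ("1" to "5") are used directly as the labels.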
+""" + + +_HOMEPAGE = "https://github.com/EricEchemane/Filipino-Tagalog-Product-Reviews-Sentiment-Analysis" + +_LANGUAGES = ['fil', 'tgl'] + +_LICENSE = Licenses.UNKNOWN.value + + +_LOCAL = False + + + +_URLS = { + _DATASETNAME: "https://raw.githubusercontent.com/EricEchemane/Filipino-Tagalog-Product-Reviews-Sentiment-Analysis/main/data/reviews.json", +} + + +_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS] + + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + + +class LazadaReviewFilipinoDataset(datasets.GeneratorBasedBuilder): + """The dataset contains over 900+ weakly annotated Filipino reviews scraped from the Lazada Philippines platform""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="lazada_review_filipino_source", + version=SOURCE_VERSION, + description="lazada reviews in filipino source schema", + schema="source", + subset_id="lazada_review_filipino", + ), + SEACrowdConfig( + name="lazada_review_filipino_seacrowd_text", + version=SEACROWD_VERSION, + description="lazada reviews in filipino SEACrowd schema", + schema="seacrowd_text", + subset_id="lazada_review_filipino", + ), + ] + + DEFAULT_CONFIG_NAME = "lazada_review_filipino_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features({"index": datasets.Value("string"), "review": datasets.Value("string"), + "rating": datasets.Value("string")}) + + elif self.config.schema == "seacrowd_text": + features = schemas.text_features(label_names=["1", "2", "3", "4", "5"]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + + gen_kwargs={ + "filepath": data_dir, + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + with open(filepath, 'r') as file: + data = json.load(file) + + if self.config.schema == "source": + for i in range(len(data)): + yield i, {"index": str(i), "review": data[i]['review'], "rating": data[i]['rating']} + + elif self.config.schema == "seacrowd_text": + for i in range(len(data)): + yield i, {"id": str(i), "text": data[i]['review'], "label": str(data[i]['rating'])} diff --git a/seacrowd/sea_datasets/lr_sum/__init__.py b/seacrowd/sea_datasets/lr_sum/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/lr_sum/lr_sum.py b/seacrowd/sea_datasets/lr_sum/lr_sum.py new file mode 100644 index 000000000..9fe440b57 --- /dev/null +++ b/seacrowd/sea_datasets/lr_sum/lr_sum.py @@ -0,0 +1,166 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@inproceedings{palen-michel-lignos-2023-lr, + author = {Palen-Michel, Chester and Lignos, Constantine}, + title = {LR - Sum: Summarization for Less-Resourced Languages}, + booktitle = {Findings of the Association for Computational Linguistics: ACL 2023}, + year = {2023}, + publisher = {Association for Computational Linguistics}, + address = {Toronto, Canada}, + doi = {10.18653/v1/2023.findings-acl.427}, + pages = {6829--6844}, +} +""" + +_LOCAL = False +_LANGUAGES = ["ind", "khm", "lao", "mya", "tha", "vie"] + +_DATASETNAME = "lr_sum" +_DESCRIPTION = """ +LR-Sum is a news abstractive summarization dataset focused on low-resource languages. It contains human-written summaries +for 39 languages and the data is based on the Multilingual Open Text corpus +(ultimately derived from the Voice of America website). +""" + +_HOMEPAGE = "https://huggingface.co/datasets/bltlab/lr-sum" +_LICENSE = Licenses.CC_BY_4_0.value +_URL = "https://huggingface.co/datasets/bltlab/lr-sum" + +_SUPPORTED_TASKS = [Tasks.SUMMARIZATION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class LRSumDataset(datasets.GeneratorBasedBuilder): + """Dataset of article-summary pairs for different low-resource languages.""" + + # Config to load individual datasets per language + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema for {lang} language", + schema="source", + subset_id=f"{_DATASETNAME}_{lang}", + ) + for lang in _LANGUAGES + ] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}_seacrowd_t2t", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema for {lang} language", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}_{lang}", + ) + for lang in _LANGUAGES + ] + + # Config to load all datasets + BUILDER_CONFIGS.extend( + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema for all languages", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_t2t", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema for all languages", + schema="seacrowd_t2t", + subset_id=_DATASETNAME, + ), + ] + ) + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "url": datasets.Value("string"), + "title": datasets.Value("string"), + "summary": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + 
citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # dl_manager not used since dataloader uses HF 'load_dataset' + return [ + datasets.SplitGenerator(name=split, gen_kwargs={"split": split._name}) + for split in ( + datasets.Split.TRAIN, + datasets.Split.VALIDATION, + datasets.Split.TEST, + ) + ] + + def _load_hf_data_from_remote(self, lang: str, split: str) -> datasets.DatasetDict: + """Load dataset from HuggingFace.""" + hf_remote_ref = "/".join(_URL.split("/")[-2:]) + return datasets.load_dataset(hf_remote_ref, lang, split=split) + + def _generate_examples(self, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + lr_sum_datasets = [] + + lang = self.config.subset_id.split("_")[-1] + if lang in _LANGUAGES: + lr_sum_datasets.append(self._load_hf_data_from_remote(lang, split)) + else: + for lang in _LANGUAGES: + lr_sum_datasets.append(self._load_hf_data_from_remote(lang, split)) + + index = 0 + for lang_subset in lr_sum_datasets: + for row in lang_subset: + if self.config.schema == "source": + example = row + + elif self.config.schema == "seacrowd_t2t": + example = { + "id": str(index), + "text_1": row["text"], + "text_2": row["summary"], + "text_1_name": "document", + "text_2_name": "summary", + } + yield index, example + index += 1 diff --git a/seacrowd/sea_datasets/m3exam/__init__.py b/seacrowd/sea_datasets/m3exam/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/m3exam/m3exam.py b/seacrowd/sea_datasets/m3exam/m3exam.py new file mode 100644 index 000000000..a604171b7 --- /dev/null +++ b/seacrowd/sea_datasets/m3exam/m3exam.py @@ -0,0 +1,308 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import re +import zipfile +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{zhang2023m3exam, + title={M3Exam: A Multilingual, Multimodal, Multilevel Benchmark for Examining Large Language Models}, + author={Wenxuan Zhang and Sharifah Mahani Aljunied and Chang Gao and Yew Ken Chia and Lidong Bing}, + year={2023}, + eprint={2306.05179}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" + +_DATASETNAME = "m3exam" + +_DESCRIPTION = """\ +M3Exam is a novel benchmark sourced from real and official human exam questions for evaluating LLMs\ +in a multilingual, multimodal, and multilevel context. In total, M3Exam contains 12,317 questions in 9\ +diverse languages with three educational levels, where about 23% of the questions require processing images\ +for successful solving. M3Exam dataset covers 3 languages spoken in Southeast Asia. 
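+This loader covers the Javanese, Thai and Vietnamese subsets; config names follow the pattern
+m3exam_{lang}_source, m3exam_{lang}_seacrowd_qa and m3exam_{lang}_seacrowd_imqa.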
+""" + +_HOMEPAGE = "https://github.com/DAMO-NLP-SG/M3Exam" + +_LANGUAGES = ["jav", "tha", "vie"] +_LANG_MAPPER = {"jav": "javanese", "tha": "thai", "vie": "vietnamese"} +_LICENSE = Licenses.CC_BY_NC_SA_4_0.value + +_LOCAL = False +_PASSWORD = "12317".encode("utf-8") # password to unzip dataset after downloading +_URLS = { + _DATASETNAME: "https://drive.usercontent.google.com/download?id=1eREETRklmXJLXrNPTyHxQ3RFdPhq_Nes&authuser=0&confirm=t", +} + +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING, Tasks.VISUAL_QUESTION_ANSWERING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class M3ExamDataset(datasets.GeneratorBasedBuilder): + """ + M3Exam is a novel benchmark sourced from real and official human exam questions for evaluating LLMs + in a multilingual, multimodal, and multilevel context. In total, M3Exam contains 12,317 questions in 9 + diverse languages with three educational levels, where about 23% of the questions require processing images + for successful solving. M3Exam dataset covers 3 languages spoken in Southeast Asia. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = ( + [SEACrowdConfig(name=f"{_DATASETNAME}_{lang}_source", version=datasets.Version(_SOURCE_VERSION), description=f"{_DATASETNAME} source schema", schema="source", subset_id=f"{_DATASETNAME}") for lang in _LANGUAGES] + + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}_seacrowd_qa", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_qa", + subset_id=f"{_DATASETNAME}", + ) + for lang in _LANGUAGES + ] + + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}_seacrowd_imqa", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_imqa", + subset_id=f"{_DATASETNAME}", + ) + for lang in _LANGUAGES + ] + ) + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_jav_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "question_text": datasets.Value("string"), + "background_description": datasets.Sequence(datasets.Value("string")), + "answer_text": datasets.Value("string"), + "options": datasets.Sequence(datasets.Value("string")), + "language": datasets.Value("string"), + "level": datasets.Value("string"), + "subject": datasets.Value("string"), + "subject_category": datasets.Value("string"), + "year": datasets.Value("string"), + "need_image": datasets.Value("string"), + "image_paths": datasets.Sequence(datasets.Value("string")), + } + ) + elif self.config.schema == "seacrowd_qa": + features = schemas.qa_features + features["meta"] = { + "background_description": datasets.Sequence(datasets.Value("string")), + "level": datasets.Value("string"), + "subject": datasets.Value("string"), + "subject_category": datasets.Value("string"), + "year": datasets.Value("string"), + } + elif self.config.schema == "seacrowd_imqa": + features = schemas.imqa_features + features["meta"] = { + "background_description": datasets.Sequence(datasets.Value("string")), + "level": datasets.Value("string"), + "subject": datasets.Value("string"), + "subject_category": datasets.Value("string"), + "year": datasets.Value("string"), + } + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> 
List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + lang = self.config.name.split("_")[1] + + data_dir = dl_manager.download(urls) + + if not os.path.exists(data_dir + "_extracted"): + if not os.path.exists(data_dir + ".zip"): + os.rename(data_dir, data_dir + ".zip") + with zipfile.ZipFile(data_dir + ".zip", "r") as zip_ref: + zip_ref.extractall(data_dir + "_extracted", pwd=_PASSWORD) # unzipping with password + if not os.path.exists(data_dir): + os.rename(data_dir + ".zip", data_dir) + image_generator = [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(data_dir + "_extracted", "data/multimodal-question"), + "split": "train", + }, + ), + ] + + text_generator = [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": os.path.join(data_dir + "_extracted", f"data/text-question/{_LANG_MAPPER[lang]}-questions-test.json"), + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir + "_extracted", f"data/text-question/{_LANG_MAPPER[lang]}-questions-dev.json"), + "split": "dev", + }, + ), + ] + if "imqa" in self.config.name: + return image_generator + else: + if "source" in self.config.name: + image_generator.extend(text_generator) + return image_generator + else: + return text_generator + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + lang = self.config.name.split("_")[1] + if self.config.schema == "source": + if split == "train": + filepath_json = os.path.join(filepath, f"{_LANG_MAPPER[lang]}-questions-image.json") + with open(filepath_json, "r") as file: + data = json.load(file) + idx = 0 + for json_obj in data: + image_paths = [] + for text in [json_obj["question_text"]] + json_obj["options"] + json_obj["background_description"]: + matches = re.findall(r"\[image-(\d+)\.(jpg|png)\]", text) + if matches: + image_path = [os.path.join(filepath, f"images-{_LANG_MAPPER[lang]}/image-{image_number[0]}.{image_number[1]}") for image_number in matches] + image_paths.extend(image_path) + example = { + "question_text": json_obj["question_text"], + "background_description": json_obj["background_description"] if "background_description" in json_obj.keys() else None, + "answer_text": json_obj["answer_text"], + "options": json_obj["options"], + "language": json_obj["language"] if "language" in json_obj.keys() else None, + "level": json_obj["level"] if "level" in json_obj.keys() else None, + "subject": json_obj["subject"] if "subject" in json_obj.keys() else None, + "subject_category": json_obj["subject_category"] if "subject_category" in json_obj.keys() else None, + "year": json_obj["year"] if "year" in json_obj.keys() else None, + "need_image": "yes", + "image_paths": image_paths, + } + yield idx, example + idx += 1 + else: + with open(filepath, "r") as file: + data = json.load(file) + idx = 0 + for json_obj in data: + example = { + "question_text": json_obj["question_text"], + "background_description": json_obj["background_description"] if "background_description" in json_obj.keys() else None, + "answer_text": json_obj["answer_text"], + "options": json_obj["options"], + "language": json_obj["language"] if "language" in json_obj.keys() else None, + "level": json_obj["level"] if "level" in json_obj.keys() else None, + "subject": json_obj["subject"] if "subject" in json_obj.keys() else None, + "subject_category": 
json_obj["subject_category"] if "subject_category" in json_obj.keys() else None, + "year": json_obj["year"] if "year" in json_obj.keys() else None, + "need_image": "no", + "image_paths": None, + } + yield idx, example + idx += 1 + + elif self.config.schema == "seacrowd_qa": + with open(filepath, "r") as file: + data = json.load(file) + idx = 0 + for json_obj in data: + example = { + "id": idx, + "question_id": idx, + "document_id": idx, + "question": json_obj["question_text"], + "type": "multiple_choice", + "choices": [". ".join(answer.split(". ")[1:]) for answer in json_obj["options"]], + "context": "", + "answer": [". ".join(answer.split(". ")[1:]) for answer in json_obj["options"] if json_obj["answer_text"] == answer[0]], + "meta": { + "background_description": json_obj["background_description"] if "background_description" in json_obj.keys() else None, + "level": json_obj["level"] if "level" in json_obj.keys() else None, + "subject": json_obj["subject"] if "subject" in json_obj.keys() else None, + "subject_category": json_obj["subject_category"] if "subject_category" in json_obj.keys() else None, + "year": json_obj["year"] if "year" in json_obj.keys() else None, + }, + } + yield idx, example + idx += 1 + + elif self.config.schema == "seacrowd_imqa": + filepath_json = os.path.join(filepath, f"{_LANG_MAPPER[lang]}-questions-image.json") + with open(filepath_json, "r") as file: + data = json.load(file) + idx = 0 + for json_obj in data: + image_paths = [] + for text in [json_obj["question_text"]] + json_obj["options"] + json_obj["background_description"]: + matches = re.findall(r"\[image-(\d+)\.(jpg|png)\]", text) + if matches: + image_path = [os.path.join(filepath, f"images-{_LANG_MAPPER[lang]}/image-{image_number[0]}.{image_number[1]}") for image_number in matches] + image_paths.extend(image_path) + + example = { + "id": idx, + "question_id": idx, + "document_id": idx, + "questions": [json_obj["question_text"]], + "type": "multiple_choice", + "choices": [". ".join(answer.split(". ")[1:]) for answer in json_obj["options"]], + "context": "", + "answer": [". ".join(answer.split(". ")[1:]) for answer in json_obj["options"] if json_obj["answer_text"] == answer[0]], + "image_paths": image_paths, + "meta": { + "background_description": json_obj["background_description"] if "background_description" in json_obj.keys() else None, + "level": json_obj["level"] if "level" in json_obj.keys() else None, + "subject": json_obj["subject"] if "subject" in json_obj.keys() else None, + "subject_category": json_obj["subject_category"] if "subject_category" in json_obj.keys() else None, + "year": json_obj["year"] if "year" in json_obj.keys() else None, + }, + } + yield idx, example + idx += 1 diff --git a/seacrowd/sea_datasets/malaysia_tweets/__init__.py b/seacrowd/sea_datasets/malaysia_tweets/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/malaysia_tweets/malaysia_tweets.py b/seacrowd/sea_datasets/malaysia_tweets/malaysia_tweets.py new file mode 100644 index 000000000..edca7e66d --- /dev/null +++ b/seacrowd/sea_datasets/malaysia_tweets/malaysia_tweets.py @@ -0,0 +1,152 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@InProceedings{10.1007/978-981-16-8515-6_44, +author="Juan, Sarah Samson +and Saee, Suhaila +and Mohamad, Fitri", +editor="Alfred, Rayner +and Lim, Yuto", +title="Social Versus Physical Distancing: Analysis of Public Health Messages at the Start of COVID-19 Outbreak in Malaysia Using Natural Language Processing", +booktitle="Proceedings of the 8th International Conference on Computational Science and Technology", +year="2022", +publisher="Springer Singapore", +address="Singapore", +pages="577--589", +abstract="The study presents an attempt to analyse how social media netizens in Malaysia responded to the calls for ``Social Distancing'' and ``Physical Distancing'' as the newly recommended social norm was introduced to the world +as a response to the COVID-19 global pandemic. The pandemic drove a sharp increase in social media platforms' use as a public health communication platform since the first wave of the COVID-19 outbreak in Malaysia in April 2020. +We analysed thousands of tweets posted by Malaysians daily between January 2020 and August 2021 to determine public perceptions and interactions patterns. The analysis focused on positive and negative reactions +and the interchanges of uses of the recommended terminologies ``social distancing'' and ``physical distancing''. Using linguistic analysis and natural language processing, +findings dominantly indicate influences from the multilingual and multicultural values held by Malaysian netizens, as they embrace the concept of distancing as a measure of global public health safety.", +isbn="978-981-16-8515-6" +} +""" + +_DATASETNAME = "malaysia_tweets" +_DESCRIPTION = """\ +This tweet data was extracted from tweets in Malaysia based on keywords +"social distancing" and "physical distancing". We conducted +sentiment analysis to understand public opinions on health messages +during the COVID-19 pandemic. Tweets from January 2020 to July 2021 +were extracted using Python module snscrape and sentiments were obtained +automatically using Polyglot and MALAYA NLP tools due to multilingual data. 
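+Each tweet carries one of three sentiment labels: POSITIVE, NEGATIVE or NEUTRAL.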
+""" + +_HOMEPAGE = "https://github.com/sarahjuan/malaysia-tweets-with-sentiment-labels" + +_LANGUAGES = ["zlm,", "eng"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.UNKNOWN.value # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://raw.githubusercontent.com/sarahjuan/malaysia-tweets-with-sentiment-labels/main/data/cleaned_tweets_sentiments.csv", +} + +_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class MalaysiaTweetsDataset(datasets.GeneratorBasedBuilder): + """This tweet data was extracted from tweets in Malaysia based on keywords + "social distancing" and "physical distancing" from January 2020 to July 2021.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "text" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + SENTIMENT_LABEL_CLASSES = ["POSITIVE", "NEGATIVE", "NEUTRAL"] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "Tweet": datasets.Value("string"), + "Sentiment": datasets.ClassLabel(names=self.SENTIMENT_LABEL_CLASSES), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text_features(self.SENTIMENT_LABEL_CLASSES) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir, + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + df = pd.read_csv(filepath, encoding="utf-8") + if self.config.schema == "source": + for idx, row in df.iterrows(): + yield idx, dict(row) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + for idx, row in df.iterrows(): + yield idx, {"id": idx, "text": row["Tweet"], "label": row["Sentiment"]} diff --git a/seacrowd/sea_datasets/malindo_morph/__init__.py b/seacrowd/sea_datasets/malindo_morph/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/malindo_morph/malindo_morph.py b/seacrowd/sea_datasets/malindo_morph/malindo_morph.py new file mode 100644 index 000000000..28af10410 --- /dev/null +++ b/seacrowd/sea_datasets/malindo_morph/malindo_morph.py @@ -0,0 +1,124 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils.configs import SEACrowdConfig +from 
seacrowd.utils.constants import Licenses + +_CITATION = """\ +@InProceedings{NOMOTO18.8, + author = {Hiroki Nomoto ,Hannah Choi ,David Moeljadi and Francis Bond}, + title = {MALINDO Morph: Morphological dictionary and analyser for Malay/Indonesian}, + booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}, + year = {2018}, + month = {may}, + date = {7-12}, + location = {Miyazaki, Japan}, + editor = {Kiyoaki Shirai}, + publisher = {European Language Resources Association (ELRA)}, + address = {Paris, France}, + isbn = {979-10-95546-24-5}, + language = {english} + } +""" + + +_DATASETNAME = "malindo_morph" + +_DESCRIPTION = """\ +MALINDO Morph is a morphological dictionary for Malay (bahasa Melayu) and Indonesian (bahasa Indonesia) language. +It contains over 200,000 lines, with each containing an analysis for one (case-sensitive) token. +Each line is made up of the following six items, separated by tabs: root, surface form, prefix, suffix, circumfix, reduplication. +""" + +_HOMEPAGE = "https://github.com/matbahasa/MALINDO_Morph" + +_LANGUAGES = ["zlm", "ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.CC_BY_4_0.value # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://raw.githubusercontent.com/matbahasa/MALINDO_Morph/master/malindo_dic_2023.tsv", +} + +_SUPPORTED_TASKS = [] + +_SOURCE_VERSION = "2023.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class MalindoMorph(datasets.GeneratorBasedBuilder): + """MALINDO Morph is a morphological dictionary for Malay (bahasa Melayu) and Indonesian (bahasa Indonesia) language. It provides morphological information (root, prefix, suffix, circumfix, reduplication) for over 200,000 surface forms.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "root": datasets.Value("string"), + "bentuk_jadian": datasets.Value("string"), + "prefix": datasets.Value("string"), + "suffix": datasets.Value("string"), + "circumfix": datasets.Value("string"), + "reduplication": datasets.Value("string"), + "source": datasets.Value("string"), + "stem": datasets.Value("string"), + "lemma": datasets.Value("string"), + } + ) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + file = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": file, + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + rows = [] + with open(filepath, encoding="utf8") as file: + for line in file: + row = line.split("\t") + row[-1] = row[-1].split("\n")[0] # 
remove newlines from lemma feature + rows.append(row) + + if self.config.schema == "source": + for key, row in enumerate(rows): + example = {"id": row[0], "root": row[1], "bentuk_jadian": row[2], "prefix": row[3], "suffix": row[4], "circumfix": row[5], "reduplication": row[6], "source": row[7], "stem": row[8], "lemma": row[9]} + yield key, example diff --git a/seacrowd/sea_datasets/malindo_parallel/__init__.py b/seacrowd/sea_datasets/malindo_parallel/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py new file mode 100644 index 000000000..e72951010 --- /dev/null +++ b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py @@ -0,0 +1,196 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This template serves as a starting point for contributing a dataset to the SEACrowd Datahub repo. + + +Full documentation on writing dataset loading scripts can be found here: +https://huggingface.co/docs/datasets/add_dataset.html + +To create a dataset loading script you will create a class and implement 3 methods: + * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. + * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. + * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. + +""" +import json +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import (DEFAULT_SEACROWD_VIEW_NAME, + DEFAULT_SOURCE_VIEW_NAME, Tasks) + +_CITATION = """\ +@misc{MALINDO-parallel, + title = "MALINDO-parallel", + howpublished = "https://github.com/matbahasa/MALINDO_Parallel/blob/master/README.md", + note = "Accessed: 2023-01-27", +} +""" + +_DATASETNAME = "malindo_parallel" + + +_DESCRIPTION = """\ +Teks ini adalah skrip video untuk Kampus Terbuka Universiti Bahasa Asing Tokyo pada tahun 2020. 
Tersedia parallel sentences dalam Bahasa Melayu/Indonesia dan Bahasa Jepang +""" + + +_HOMEPAGE = "https://github.com/matbahasa/MALINDO_Parallel/tree/master/OpenCampusTUFS" + + +_LANGUAGES = ["zlm", "jpn"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + + +_LICENSE = "Creative Commons Attribution 4.0 (cc-by-4.0)" + + +_LOCAL = False + + +_URLS = { + _DATASETNAME: "https://raw.githubusercontent.com/matbahasa/MALINDO_Parallel/master/OpenCampusTUFS/OCTUFS2020.txt", +} + + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + + +class MalindoParallelDataset(datasets.GeneratorBasedBuilder): + """Data terjemahan bahasa Melayu/Indonesia""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="malindo_parallel_source", + version=SOURCE_VERSION, + description="malindo_parallel source schema", + schema="source", + subset_id="malindo_parallel", + ), + SEACrowdConfig( + name="malindo_parallel_seacrowd_t2t", + version=SEACROWD_VERSION, + description="malindo_parallel SEACrowd schema", + schema="seacrowd_t2t", + subset_id="malindo_parallel", + ), + ] + + DEFAULT_CONFIG_NAME = "malindo_parallel_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features({"id": datasets.Value("string"), "text": datasets.Value("string")}) + + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + + gen_kwargs={ + "filepath": data_dir, + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + + rows = [] + temp_cols = None + with open(filepath) as file: + while line := file.readline(): + if temp_cols is None: + cols = [] + for col in line.split('\t'): + if len(col.strip('\n'))>0: + cols.append(col) + if len(cols) > 2: + correct_line = line.rstrip() + rows.append(correct_line) + else: + temp_cols = cols + else: + temp_cols.append(line) + correct_line = "\t".join(temp_cols).rstrip() + temp_cols = None + rows.append(correct_line) + + if self.config.schema == "source": + + for i, row in enumerate(rows): + t1idx = row.find("\t") + 1 + t2idx = row[t1idx:].find("\t") + row_id = row[:t1idx] + row_melayu = row[t1idx : t1idx + t2idx] + row_japanese = row[t1idx + t2idx + 1 : -1] + ex = {"id": row_id.rstrip(), + "text": row_melayu + "\t" + row_japanese} + yield i, ex + + elif self.config.schema == "seacrowd_t2t": + + for i, row in enumerate(rows): + t1idx = row.find("\t") + 1 + t2idx = row[t1idx:].find("\t") + row_id = row[:t1idx] + row_melayu = row[t1idx : t1idx + t2idx] + row_japanese = row[t1idx + t2idx + 1 : -1] + ex = { + "id": row_id.rstrip(), + "text_1": row_melayu, + "text_2": row_japanese, + "text_1_name": "zlm", + "text_2_name": "jpn", + } + yield i, ex + + + +if __name__ == "__main__": + datasets.load_dataset(__file__) diff --git 
a/seacrowd/sea_datasets/massive/__init__.py b/seacrowd/sea_datasets/massive/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/massive/massive.py b/seacrowd/sea_datasets/massive/massive.py new file mode 100644 index 000000000..6b2fceb5d --- /dev/null +++ b/seacrowd/sea_datasets/massive/massive.py @@ -0,0 +1,580 @@ +import json +from typing import List + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@misc{fitzgerald2022massive, + title={MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages}, + author={Jack FitzGerald and Christopher Hench and Charith Peris and Scott Mackie and Kay Rottmann and Ana Sanchez and Aaron + Nash and Liam Urbach and Vishesh Kakarala and Richa Singh and Swetha Ranganath and Laurie Crist and Misha Britan and Wouter + Leeuwis and Gokhan Tur and Prem Natarajan}, + year={2022}, + eprint={2204.08582}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +@inproceedings{bastianelli-etal-2020-slurp, + title = "{SLURP}: A Spoken Language Understanding Resource Package", + author = "Bastianelli, Emanuele and + Vanzo, Andrea and + Swietojanski, Pawel and + Rieser, Verena", + booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)", + month = nov, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2020.emnlp-main.588", + doi = "10.18653/v1/2020.emnlp-main.588", + pages = "7252--7262", + abstract = "Spoken Language Understanding infers semantic meaning directly from audio data, and thus promises to + reduce error propagation and misunderstandings in end-user applications. However, publicly available SLU resources are limited. + In this paper, we release SLURP, a new SLU package containing the following: (1) A new challenging dataset in English spanning + 18 domains, which is substantially bigger and linguistically more diverse than existing datasets; (2) Competitive baselines + based on state-of-the-art NLU and ASR systems; (3) A new transparent metric for entity labelling which enables a detailed error + analysis for identifying potential areas of improvement. SLURP is available at https://github.com/pswietojanski/slurp." +} +""" +_DATASETNAME = "massive" +_DESCRIPTION = """\ +MASSIVE dataset—Multilingual Amazon Slu resource package (SLURP) for Slot-filling, Intent classification, and +Virtual assistant Evaluation. MASSIVE contains 1M realistic, parallel, labeled virtual assistant utterances +spanning 18 domains, 60 intents, and 55 slots. MASSIVE was created by tasking professional translators to +localize the English-only SLURP dataset into 50 typologically diverse languages, including 8 native languages +and 2 other languages mostly spoken in Southeast Asia. 
+""" +_HOMEPAGE = "https://github.com/alexa/massive" +_LICENSE = Licenses.CC_BY_4_0.value +_LOCAL = False +_LANGUAGES = ["ind", "jav", "khm", "zlm", "mya", "tha", "tgl", "vie"] + +_URLS = { + _DATASETNAME: "https://amazon-massive-nlu-dataset.s3.amazonaws.com/amazon-massive-dataset-1.1.tar.gz", +} +_SUPPORTED_TASKS = [Tasks.INTENT_CLASSIFICATION, Tasks.SLOT_FILLING] +_SOURCE_VERSION = "1.1.0" +_SEACROWD_VERSION = "1.0.0" + +# ind, jav, khm, zlm, mya, tha, tgl, vie, cmn, tam +_LANGS = [ + "af-ZA", + "am-ET", + "ar-SA", + "az-AZ", + "bn-BD", + "cy-GB", + "da-DK", + "de-DE", + "el-GR", + "en-US", + "es-ES", + "fa-IR", + "fi-FI", + "fr-FR", + "he-IL", + "hi-IN", + "hu-HU", + "hy-AM", + "id-ID", # ind + "is-IS", + "it-IT", + "ja-JP", + "jv-ID", # jav + "ka-GE", + "km-KH", # khm + "kn-IN", + "ko-KR", + "lv-LV", + "ml-IN", + "mn-MN", + "ms-MY", # zlm + "my-MM", # mya + "nb-NO", + "nl-NL", + "pl-PL", + "pt-PT", + "ro-RO", + "ru-RU", + "sl-SL", + "sq-AL", + "sv-SE", + "sw-KE", + "ta-IN", + "te-IN", + "th-TH", # tha + "tl-PH", # tgl + "tr-TR", + "ur-PK", + "vi-VN", # vie + "zh-CN", # cmn + "zh-TW", +] +_SUBSETS = ["id-ID", "jv-ID", "km-KH", "ms-MY", "my-MM", "th-TH", "tl-PH", "vi-VN"] + +_SCENARIOS = ["calendar", "recommendation", "social", "general", "news", "cooking", "iot", "email", "weather", "alarm", "transport", "lists", "takeaway", "play", "audio", "music", "qa", "datetime"] + +_INTENTS = [ + "audio_volume_other", + "play_music", + "iot_hue_lighton", + "general_greet", + "calendar_set", + "audio_volume_down", + "social_query", + "audio_volume_mute", + "iot_wemo_on", + "iot_hue_lightup", + "audio_volume_up", + "iot_coffee", + "takeaway_query", + "qa_maths", + "play_game", + "cooking_query", + "iot_hue_lightdim", + "iot_wemo_off", + "music_settings", + "weather_query", + "news_query", + "alarm_remove", + "social_post", + "recommendation_events", + "transport_taxi", + "takeaway_order", + "music_query", + "calendar_query", + "lists_query", + "qa_currency", + "recommendation_movies", + "general_joke", + "recommendation_locations", + "email_querycontact", + "lists_remove", + "play_audiobook", + "email_addcontact", + "lists_createoradd", + "play_radio", + "qa_stock", + "alarm_query", + "email_sendemail", + "general_quirky", + "music_likeness", + "cooking_recipe", + "email_query", + "datetime_query", + "transport_traffic", + "play_podcasts", + "iot_hue_lightchange", + "calendar_remove", + "transport_query", + "transport_ticket", + "qa_factoid", + "iot_cleaning", + "alarm_set", + "datetime_convert", + "iot_hue_lightoff", + "qa_definition", + "music_dislikeness", +] + +_TAGS = [ + "O", + "B-food_type", + "B-movie_type", + "B-person", + "B-change_amount", + "I-relation", + "I-game_name", + "B-date", + "B-movie_name", + "I-person", + "I-place_name", + "I-podcast_descriptor", + "I-audiobook_name", + "B-email_folder", + "B-coffee_type", + "B-app_name", + "I-time", + "I-coffee_type", + "B-transport_agency", + "B-podcast_descriptor", + "I-playlist_name", + "B-media_type", + "B-song_name", + "I-music_descriptor", + "I-song_name", + "B-event_name", + "I-timeofday", + "B-alarm_type", + "B-cooking_type", + "I-business_name", + "I-color_type", + "B-podcast_name", + "I-personal_info", + "B-weather_descriptor", + "I-list_name", + "B-transport_descriptor", + "I-game_type", + "I-date", + "B-place_name", + "B-color_type", + "B-game_name", + "I-artist_name", + "I-drink_type", + "B-business_name", + "B-timeofday", + "B-sport_type", + "I-player_setting", + "I-transport_agency", + "B-game_type", + "B-player_setting", + 
"I-music_album", + "I-event_name", + "I-general_frequency", + "I-podcast_name", + "I-cooking_type", + "I-radio_name", + "I-joke_type", + "I-meal_type", + "I-transport_type", + "B-joke_type", + "B-time", + "B-order_type", + "B-business_type", + "B-general_frequency", + "I-food_type", + "I-time_zone", + "B-currency_name", + "B-time_zone", + "B-ingredient", + "B-house_place", + "B-audiobook_name", + "I-ingredient", + "I-media_type", + "I-news_topic", + "B-music_genre", + "I-definition_word", + "B-list_name", + "B-playlist_name", + "B-email_address", + "I-currency_name", + "I-movie_name", + "I-device_type", + "I-weather_descriptor", + "B-audiobook_author", + "I-audiobook_author", + "I-app_name", + "I-order_type", + "I-transport_name", + "B-radio_name", + "I-business_type", + "B-definition_word", + "B-artist_name", + "I-movie_type", + "B-transport_name", + "I-email_folder", + "B-music_album", + "I-house_place", + "I-music_genre", + "B-drink_type", + "I-alarm_type", + "B-music_descriptor", + "B-news_topic", + "B-meal_type", + "I-transport_descriptor", + "I-email_address", + "I-change_amount", + "B-device_type", + "B-transport_type", + "B-relation", + "I-sport_type", + "B-personal_info", +] + + +class MASSIVEDataset(datasets.GeneratorBasedBuilder): + """MASSIVE datasets contains datasets to detect the intent from the text and fill the dialogue slots""" + + BUILDER_CONFIGS = ( + [ + SEACrowdConfig( + name=f"massive_{subset}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"MASSIVE source schema for {subset}", + schema="source", + subset_id="massive_" + subset, + ) + for subset in _SUBSETS + ] + + [ + SEACrowdConfig( + name=f"massive_{subset}_seacrowd_text", + version=datasets.Version(_SEACROWD_VERSION), + description=f"MASSIVE Nusantara intent classification schema for {subset}", + schema="seacrowd_text", + subset_id="massive_intent_" + subset, + ) + for subset in _SUBSETS + ] + + [ + SEACrowdConfig( + name=f"massive_{subset}_seacrowd_seq_label", + version=datasets.Version(_SEACROWD_VERSION), + description=f"MASSIVE Nusantara slot filling schema for {subset}", + schema="seacrowd_seq_label", + subset_id="massive_slot_filling_" + subset, + ) + for subset in _SUBSETS + ] + + [ + SEACrowdConfig( + name="massive_source", + version=datasets.Version(_SOURCE_VERSION), + description="MASSIVE source schema", + schema="source", + subset_id="massive", + ), + SEACrowdConfig( + name="massive_seacrowd_text", + version=datasets.Version(_SEACROWD_VERSION), + description="MASSIVE Nusantara intent classification schema", + schema="seacrowd_text", + subset_id="massive_intent", + ), + SEACrowdConfig( + name="massive_seacrowd_seq_label", + version=datasets.Version(_SEACROWD_VERSION), + description="MASSIVE Nusantara slot filling schema", + schema="seacrowd_seq_label", + subset_id="massive_slot_filling", + ), + ] + ) + + DEFAULT_CONFIG_NAME = "massive_id-ID_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "locale": datasets.Value("string"), + "partition": datasets.Value("string"), + "scenario": datasets.features.ClassLabel(names=_SCENARIOS), + "intent": datasets.features.ClassLabel(names=_INTENTS), + "utt": datasets.Value("string"), + "annot_utt": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "ner_tags": datasets.Sequence(datasets.features.ClassLabel(names=_TAGS)), + "worker_id": datasets.Value("string"), + "slot_method": datasets.Sequence( + { + 
"slot": datasets.Value("string"), + "method": datasets.Value("string"), + } + ), + "judgments": datasets.Sequence( + { + "worker_id": datasets.Value("string"), + "intent_score": datasets.Value("int8"), # [0, 1, 2] + "slots_score": datasets.Value("int8"), # [0, 1, 2] + "grammar_score": datasets.Value("int8"), # [0, 1, 2, 3, 4] + "spelling_score": datasets.Value("int8"), # [0, 1, 2] + "language_identification": datasets.Value("string"), + } + ), + } + ) + elif self.config.schema == "seacrowd_text": + features = schemas.text_features(label_names=_INTENTS) + elif self.config.schema == "seacrowd_seq_label": + features = schemas.seq_label_features(label_names=_TAGS) + else: + raise ValueError(f"Invalid config schema: {self.config.schema}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + archive = dl_manager.download(_URLS[_DATASETNAME]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "files": dl_manager.iter_archive(archive), + "split": "train", + "lang": self.config.name, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "files": dl_manager.iter_archive(archive), + "split": "dev", + "lang": self.config.name, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "files": dl_manager.iter_archive(archive), + "split": "test", + "lang": self.config.name, + }, + ), + ] + + def _get_bio_format(self, text): + """This function is modified from https://huggingface.co/datasets/qanastek/MASSIVE/blob/main/MASSIVE.py""" + tags, tokens = [], [] + + bio_mode = False + cpt_bio = 0 + current_tag = None + + split_iter = iter(text.split(" ")) + + for s in split_iter: + if s.startswith("["): + current_tag = s.strip("[") + bio_mode = True + cpt_bio += 1 + next(split_iter) + continue + + elif s.endswith("]"): + bio_mode = False + if cpt_bio == 1: + prefix = "B-" + else: + prefix = "I-" + token = prefix + current_tag + word = s.strip("]") + current_tag = None + cpt_bio = 0 + + else: + if bio_mode: + if cpt_bio == 1: + prefix = "B-" + else: + prefix = "I-" + token = prefix + current_tag + word = s + cpt_bio += 1 + else: + token = "O" + word = s + + tags.append(token) + tokens.append(word) + + return tokens, tags + + def _generate_examples(self, files: list, split: str, lang: str): + _id = 0 + + lang = lang.replace("massive_", "").replace("source", "").replace("seacrowd_text", "").replace("seacrowd_seq_label", "") + + if not lang: + lang = _LANGS.copy() + else: + lang = [lang[:-1]] + + # logger.info("Generating examples from = %s", ", ".join(lang)) + + for path, f in files: + curr_lang = path.split(f"{_SOURCE_VERSION[:-2]}/data/")[-1].split(".jsonl")[0] + + if not lang: + break + elif curr_lang in lang: + lang.remove(curr_lang) + else: + continue + + # Read the file + lines = f.read().decode(encoding="utf-8").split("\n") + + for line in lines: + data = json.loads(line) + + if data["partition"] != split: + continue + + # Slot method + if "slot_method" in data: + slot_method = [ + { + "slot": s["slot"], + "method": s["method"], + } + for s in data["slot_method"] + ] + else: + slot_method = [] + + # Judgments + if "judgments" in data: + judgments = [ + { + "worker_id": j["worker_id"], + "intent_score": j["intent_score"], + "slots_score": j["slots_score"], + "grammar_score": j["grammar_score"], + "spelling_score": 
j["spelling_score"], + "language_identification": j["language_identification"] if "language_identification" in j else "target", + } + for j in data["judgments"] + ] + else: + judgments = [] + + if self.config.schema == "source": + tokens, tags = self._get_bio_format(data["annot_utt"]) + + yield _id, { + "id": str(_id) + "_" + data["id"], + "locale": data["locale"], + "partition": data["partition"], + "scenario": data["scenario"], + "intent": data["intent"], + "utt": data["utt"], + "annot_utt": data["annot_utt"], + "tokens": tokens, + "ner_tags": tags, + "worker_id": data["worker_id"], + "slot_method": slot_method, + "judgments": judgments, + } + + elif self.config.schema == "seacrowd_seq_label": + tokens, tags = self._get_bio_format(data["annot_utt"]) + + yield _id, { + "id": str(_id) + "_" + data["id"], + "tokens": tokens, + "labels": tags, + } + + elif self.config.schema == "seacrowd_text": + yield _id, { + "id": str(_id) + "_" + data["id"], + "text": data["utt"], + "label": data["intent"], + } + + else: + raise ValueError(f"Invalid config: {self.config.name}") + + _id += 1 diff --git a/seacrowd/sea_datasets/melayu_brunei/__init__.py b/seacrowd/sea_datasets/melayu_brunei/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/melayu_brunei/melayu_brunei.py b/seacrowd/sea_datasets/melayu_brunei/melayu_brunei.py new file mode 100644 index 000000000..d241d172a --- /dev/null +++ b/seacrowd/sea_datasets/melayu_brunei/melayu_brunei.py @@ -0,0 +1,197 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils import schemas +from seacrowd.utils.constants import Tasks, Licenses, TASK_TO_SCHEMA + + +_CITATION = """\ +@article{shiohara2021two, + title={Two Brunei Malay Texts: A Story of the Maiden Stem and Two Episodes in the History of Weston and Bukau}, + author={Shiohara, Asako and Fitri, Mohd Izzuddin}, + journal={アジア・アフリカの言語と言語学 (Asian and African languages and linguistics)}, + number={15}, + pages={171--190}, + year={2021}, + publisher={アジア・アフリカ言語文化研究所} +} +""" + + +_DATASETNAME = "melayu_brunei" + +_DESCRIPTION = """\ +This article gives two texts of Brunei Malay (ISO 639-3: kxd) collected in the town of Weston in Sabah State of Malaysia. +The texts exhibit linguistic features that are similar to those of Brunei Malay spoken in Brunei Darussalam; +it has a vowel inventory of only three vowels /a, i, u/, use of the pronoun kitani for the first person plural inclusive +and the use of the base-stem transitive form in patientive voice clauses. One of the texts tells a folk story about +Batang Dayang and other text includes two episodes: Javanese runaways arriving in Weston and the origin of the name +Bukau, a town near Weston. 
+""" + +_HOMEPAGE = "https://github.com/matbahasa/Melayu_Brunei" + +_LANGUAGES = ['kxd'] + +_LICENSE = Licenses.CC_BY_4_0.value + +_LOCAL = False + +_URLS = { + 'Folklor2-1-01': 'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-1-01.txt', + 'Folklor2-1-02':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-1-02.txt', + 'Folklor2-1-03':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-1-03.txt', + 'Folklor2-1-04':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-1-04.txt', + 'Folklor2-1-05':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-1-05.txt', + 'Folklor2-1-06':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-1-06.txt', + 'Folklor2-1-07':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-1-07.txt', + 'Folklor2-1-08':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-1-08.txt', + 'Folklor2-2-01':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-01.txt', + 'Folklor2-2-02':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-02.txt', + 'Folklor2-2-03':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-03.txt', + 'Folklor2-2-04':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-04.txt', + 'Folklor2-2-05':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-05.txt', + 'Folklor2-2-06':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-06.txt', + 'Folklor2-2-07':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-07.txt', + 'Folklor2-2-08':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-08.txt', + 'Folklor2-2-09':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-09.txt', + 'Folklor2-2-10':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-10.txt', + 'Folklor2-2-11':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-11.txt', + 'Folklor2-2-12':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-12.txt', + 'Folklor2-2-13':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-13.txt', + 'Folklor2-2-14':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-14.txt', + 'Folklor2-2-15':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-15.txt', + 'Folklor2-2-16':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-2-16.txt', + 'Folklor2-3-01':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-3-01.txt', + 'Folklor2-3-02':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-3-02.txt', + 'Folklor2-3-03':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-3-03.txt', + 'Folklor2-3-06':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-3-06.txt', + 'Folklor2-3-07':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-3-07.txt', + 'Folklor2-3-08':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-3-08.txt', + 
'Folklor2-3-09':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-3-09.txt', + 'Folklor2-3-10':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-3-10.txt', + 'Folklor2-4-00':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-4-00.txt', + 'Folklor2-4-01':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-4-01.txt', + 'Folklor2-4-02':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-4-02.txt', + 'Folklor2-4-03':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-4-03.txt', + 'Folklor2-4-04':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-4-04.txt', + 'Folklor2-4-05':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-4-05.txt', + 'Folklor2-4-06':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-4-06.txt', + 'Folklor2-4-07':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-4-07.txt', + 'Folklor2-4-08':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-4-08.txt', + 'Folklor2-4-09':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-4-09.txt', + 'Folklor2-5-01':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-5-01.txt', + 'Folklor2-5-02':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-5-02.txt', + 'Folklor2-5-03':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor2-5-03.txt', + 'Folklor3-0-01':'https://raw.githubusercontent.com/matbahasa/Melayu_Brunei/master/Folklor/Folklor3-0-01.txt', +} + +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] + + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case +class MelayuBruneiDataset(datasets.GeneratorBasedBuilder): + """This article gives two texts of Brunei Malay (ISO 639-3: kxd) collected in the town of + Weston in Sabah State of Malaysia. The texts exhibit linguistic features that are similar to those of + Brunei Malay spoken in Brunei Darussalam; it has a vowel inventory of only three vowels /a, i, u/, + use of the pronoun kitani for the first person plural inclusive and the use of the base-stem transitive + form in patientive voice clauses. One of the texts tells a folk story about Batang Dayang and other text + includes two episodes: Javanese runaways arriving in Weston and the origin of the name Bukau, a town near Weston. 
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower() + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.self_supervised_pretraining.features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = [_URLS[key] for key in _URLS.keys()] + data_path = dl_manager.download_and_extract(urls) + + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_path[0], + "other_path": data_path[1:] + }, + ), + ] + + def _generate_examples(self, filepath: Path, other_path: List) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + filepaths = [filepath] + other_path + data = [] + for filepath in filepaths: + with open(filepath, "r") as f: + data.append(" ".join([line.rstrip() for line in f.readlines()])) + + for id, text in enumerate(data): + yield id, {"id": id, "text": text} \ No newline at end of file diff --git a/seacrowd/sea_datasets/melayu_sabah/__init__.py b/seacrowd/sea_datasets/melayu_sabah/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/melayu_sabah/melayu_sabah.py b/seacrowd/sea_datasets/melayu_sabah/melayu_sabah.py new file mode 100644 index 000000000..f3b6520b8 --- /dev/null +++ b/seacrowd/sea_datasets/melayu_sabah/melayu_sabah.py @@ -0,0 +1,148 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +_DATASETNAME = "melayu_sabah" + + +_DESCRIPTION = """\ +Korpus Variasi Bahasa Melayu: Sabah is a language corpus sourced from various folklores in Melayu Sabah dialect. 
+""" + +_CITATION = """\ +@misc{melayusabah, + author = {Hiroki Nomoto}, + title = {Melayu_Sabah}, + year = {2020}, + publisher = {GitHub}, + journal = {GitHub repository}, + howpublished = {\\url{https://github.com/matbahasa/Melayu_Sabah}}, + commit = {90a46c8268412ccc1f29cdcbbd47354474f12d50} +} +""" + +_HOMEPAGE = "https://github.com/matbahasa/Melayu_Sabah" + + +_LANGUAGES = ["msi"] + +_LICENSE = Licenses.CC_BY_4_0.value + +_LOCAL = False + +_URLS = { + "sabah201701": "https://raw.githubusercontent.com/matbahasa/Melayu_Sabah/master/Sabah201701.txt", + "sabah201702": "https://raw.githubusercontent.com/matbahasa/Melayu_Sabah/master/Sabah201702.txt", + "sabah201901": "https://raw.githubusercontent.com/matbahasa/Melayu_Sabah/master/Sabah201901.txt", + "sabah201902": "https://raw.githubusercontent.com/matbahasa/Melayu_Sabah/master/Sabah201902.txt", + "sabah201903": "https://raw.githubusercontent.com/matbahasa/Melayu_Sabah/master/Sabah201903.txt", + "sabah201904": "https://raw.githubusercontent.com/matbahasa/Melayu_Sabah/master/Sabah201904.txt", + "sabah201905": "https://raw.githubusercontent.com/matbahasa/Melayu_Sabah/master/Sabah201905.txt", + "sabah201906": "https://raw.githubusercontent.com/matbahasa/Melayu_Sabah/master/Sabah201906.txt", + "sabah201907": "https://raw.githubusercontent.com/matbahasa/Melayu_Sabah/master/Sabah201907.txt", + "sabah201908": "https://raw.githubusercontent.com/matbahasa/Melayu_Sabah/master/Sabah201908.txt", + "sabah201909": "https://raw.githubusercontent.com/matbahasa/Melayu_Sabah/master/Sabah201909.txt", +} + +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class MelayuSabah(datasets.GeneratorBasedBuilder): + """Korpus Variasi Bahasa Melayu: + Sabah is a language corpus sourced from various folklores in Melayu Sabah dialect.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower() + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_ssp": + features = schemas.self_supervised_pretraining.features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = [_URLS[key] for key in _URLS.keys()] + data_path = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": data_path[0], "split": "train", "other_path": data_path[1:]}, + ) + ] + + def _generate_examples(self, filepath: Path, split: str, other_path: List) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + filepaths = 
[filepath] + other_path + data = [] + for filepath in filepaths[:2]: + with open(filepath, "r") as f: + sentences = [line.rstrip() for line in f.readlines()] + sentences = [sentence.split("\t")[-1] for sentence in sentences] + data.append("\n".join(sentences)) + + for filepath in filepaths[2:]: + with open(filepath, "r") as f: + data.append("\n".join(line.rstrip() for line in f.readlines())) + + for id, text in enumerate(data): + yield id, {"id": id, "text": text} diff --git a/seacrowd/sea_datasets/melayu_sarawak/__init__.py b/seacrowd/sea_datasets/melayu_sarawak/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/melayu_sarawak/melayu_sarawak.py b/seacrowd/sea_datasets/melayu_sarawak/melayu_sarawak.py new file mode 100644 index 000000000..e6e98cc88 --- /dev/null +++ b/seacrowd/sea_datasets/melayu_sarawak/melayu_sarawak.py @@ -0,0 +1,146 @@ +# coding=utf-8 +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +_DATASETNAME = "melayu_sarawak" + + +_DESCRIPTION = """\ +Korpus Variasi Bahasa Melayu: Sarawak is a language corpus sourced from various folklores in Melayu Sarawak dialect. +""" + + +_HOMEPAGE = "https://github.com/matbahasa/Melayu_Sarawak" + +_CITATION = """\ +@misc{melayusarawak, + author = {Hiroki Nomoto}, + title = {Melayu_Sarawak}, + year = {2020}, + publisher = {GitHub}, + journal = {GitHub repository}, + howpublished = {\\url{https://github.com/matbahasa/Melayu_Sarawak}}, + commit = {a175f691f9db94d7b4f971e7a93b7cc001c0ed47} +} +""" + +_LANGUAGES = ["zlm"] + +_LICENSE = Licenses.CC_BY_4_0.value + +_LOCAL = False + +_URLS = { + "sarawak201801": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201801.txt", + "sarawak201802": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201802.txt", + "sarawak201803": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201803.txt", + "sarawak201804": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201804.txt", + "sarawak201805": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201805.txt", + "sarawak201806": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201806.txt", + "sarawak201807": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201807.txt", + "sarawak201808": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201808.txt", + "sarawak201809": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201809.txt", + "sarawak201810": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201810.txt", + "sarawak201811": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201811.txt", + "sarawak201812": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201812.txt", + "sarawak201813": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201813.txt", + "sarawak201814": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201814.txt", + "sarawak201815": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201815.txt", + "sarawak201817": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201817.txt", + "sarawak201818": 
"https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201818.txt", + "sarawak201819": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201819.txt", + "sarawak201820": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201820.txt", + "sarawak201821": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201821.txt", + "sarawak201822": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201822.txt", + "sarawak201823": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201823.txt", + "sarawak201824": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201824.txt", + "sarawak201825": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201825.txt", + "sarawak201826": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201826.txt", + "sarawak201827": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201827.txt", + "sarawak201828": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201828.txt", + "sarawak201829": "https://raw.githubusercontent.com/matbahasa/Melayu_Sarawak/master/Sarawak201829.txt", +} + +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class MelayuSarawakDataset(datasets.GeneratorBasedBuilder): + """Korpus Variasi Bahasa Melayu: + Sarawak is a language corpus sourced from various folklores in Melayu Sarawak dialect.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower() + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_ssp": + features = schemas.self_supervised_pretraining.features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = [_URLS[key] for key in _URLS.keys()] + data_path = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": data_path[0], "split": "train", "other_path": data_path[1:]}, + ) + ] + + def _generate_examples(self, filepath: Path, split: str, other_path: List) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + filepaths = [filepath] + other_path + data = [] + for filepath in filepaths: + with open(filepath, "r") as f: + data.append([line.rstrip() for line in f.readlines()]) + + for id, text in enumerate(data): + yield id, {"id": id, "text": text} \ No newline at end of file diff --git 
a/seacrowd/sea_datasets/melayu_standard_lisan/__init__.py b/seacrowd/sea_datasets/melayu_standard_lisan/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/melayu_standard_lisan/melayu_standard_lisan.py b/seacrowd/sea_datasets/melayu_standard_lisan/melayu_standard_lisan.py new file mode 100644 index 000000000..0bc21a657 --- /dev/null +++ b/seacrowd/sea_datasets/melayu_standard_lisan/melayu_standard_lisan.py @@ -0,0 +1,162 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +_CITATION = """\ +@misc{nomoto2018melayustandardlisan, + author = {Hiroki Nomoto}, + title = {Korpus Variasi Bahasa Melayu: Standard Lisan}, + year = {2018}, + url = {https://github.com/matbahasa/Melayu_Standard_Lisan} +} +""" + +_DATASETNAME = "melayu_standard_lisan" + + +_DESCRIPTION = """\ +Korpus Variasi Bahasa Melayu: Standard Lisan is a language corpus sourced from monologues of various melayu folklores. 
+""" + + +_HOMEPAGE = "https://github.com/matbahasa/Melayu_Standard_Lisan" + + +_LANGUAGES = ["zlm"] + +_LICENSE = Licenses.CC_BY_4_0.value + +_LOCAL = False + +_URLS = { + "kl201701": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201701.txt", + "kl201702": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201702.txt", + "kl201703": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201703.txt", + "kl201704": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201704.txt", + "kl201705": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201705.txt", + "kl201706": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201706.txt", + "kl201707": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201707.txt", + "kl201708": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201708.txt", + "kl201709": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201709.txt", + "kl201710": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201710.txt", + "kl201711": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201711.txt", + "kl201712": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201712.txt", + "kl201713": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201713.txt", + "kl201714": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201714.txt", + "kl201715": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201715.txt", + "kl201716": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201716.txt", + "kl201717": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201717.txt", + "kl201718": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201718.txt", + "kl201719": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201719.txt", + "kl201720": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201720.txt", + "kl201721": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201721.txt", + "kl201722": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201722.txt", + "kl201723": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201723.txt", + "kl201724": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201724.txt", + "kl201725": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201725.txt", + "kl201726": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201726.txt", + "kl201727": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201727.txt", + "kl201728": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201728.txt", + "kl201729": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201729.txt", + "kl201730": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201730.txt", + "kl201731": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201731.txt", + "kl201732": "https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201732.txt", + "kl201733": 
"https://raw.githubusercontent.com/matbahasa/Melayu_Standard_Lisan/master/KL201733.txt", +} + +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class MelayuStandardLisan(datasets.GeneratorBasedBuilder): + """Korpus Variasi Bahasa Melayu: + Standard Lisan is a language corpus sourced from monologues of various melayu folklores.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower() + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.self_supervised_pretraining.features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = [_URLS[key] for key in _URLS.keys()] + data_path = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": data_path[0], "split": "train", "other_path": data_path[1:]}, + ) + ] + + def _generate_examples(self, filepath: Path, split: str, other_path: List) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + filepaths = [filepath] + other_path + data = [] + for filepath in filepaths: + with open(filepath, "r") as f: + data.append(" ".join([line.rstrip() for line in f.readlines()])) + + for id, text in enumerate(data): + yield id, {"id": id, "text": text} diff --git a/seacrowd/sea_datasets/memolon/__init__.py b/seacrowd/sea_datasets/memolon/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/memolon/memolon.py b/seacrowd/sea_datasets/memolon/memolon.py new file mode 100644 index 000000000..5b3c6b97c --- /dev/null +++ b/seacrowd/sea_datasets/memolon/memolon.py @@ -0,0 +1,142 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{buechel-etal-2020-learning-evaluating, + title = "Learning and Evaluating Emotion Lexicons for 91 Languages", + author = {Buechel, Sven and + R{\"u}cker, Susanna and + Hahn, Udo}, + editor = "Jurafsky, Dan and + Chai, Joyce and + Schluter, Natalie and + Tetreault, Joel", + booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", + month = jul, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2020.acl-main.112", + doi = "10.18653/v1/2020.acl-main.112", + pages = "1202--1217", +} +""" + +_DATASETNAME = "memolon" + +_DESCRIPTION = """\ +MEmoLon is an emotion lexicons for 91 languages, each one covers eight emotional variables and comprises over 100k word entries. There are several versions of the lexicons, the difference being the choice of the expansion model. +""" + +_HOMEPAGE = "https://zenodo.org/record/3756607/files/MTL_grouped.zip?download=1" + +_LICENSE = Licenses.MIT.value + +_URLS = { + _DATASETNAME: "https://zenodo.org/record/3756607/files/MTL_grouped.zip?download=1", +} + +_SOURCE_VERSION = "1.0.0" + +_LANGUAGES = ["ceb", "tgl", "ind", "sun", "jav", "zsm", "vie", "tha", "mya"] + +_LANGUAGE_MAP = {"ceb": "Cebuano", "tgl": "Tagalog", "ind": "Indonesian", "sun": "Sundanese", "jav": "Javanese", "zsm": "Malay", "vie": "Vietnamese", "tha": "Thai", "mya": "Burmese"} + +_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION] + + +def seacrowd_config_constructor(lang: str, schema: str, version: str) -> SEACrowdConfig: + if lang not in _LANGUAGE_MAP: + raise ValueError(f"Invalid lang {lang}") + + if schema != "source" and schema != "seacrowd_text_multi": + raise ValueError(f"Invalid schema: {schema}") + + return SEACrowdConfig( + name="memolon_{lang}_{schema}".format(lang=lang, schema=schema), + version=datasets.Version(version), + description="MEmoLon {schema} schema for {lang} language".format(lang=_LANGUAGE_MAP[lang], schema=schema), + schema=schema, + subset_id="memolon", + ) + + +class Memolon(datasets.GeneratorBasedBuilder): + """MEmoLon is an emotion lexicons for 91 languages, each one covers eight emotional variables and comprises over 100k word entries.""" + + BUILDER_CONFIGS = [SEACrowdConfig(name=f"{_DATASETNAME}_{lang}_source", version=datasets.Version(_SOURCE_VERSION), description=f"MEmoLon source schema for {lang} language", schema="source", subset_id="memolon") for lang in _LANGUAGE_MAP] + + DEFAULT_CONFIG_NAME = None + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "word": datasets.Value("string"), + "valence": datasets.Value("float32"), + "arousal": datasets.Value("float32"), + "dominance": datasets.Value("float32"), + "joy": datasets.Value("float32"), + "anger": datasets.Value("float32"), + "sadness": datasets.Value("float32"), + "fear": datasets.Value("float32"), + "disgust": datasets.Value("float32"), + } + ) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + base_path = 
Path(dl_manager.download_and_extract(urls)) + lang = self.config.name.split("_")[1] + train_data_path = base_path / f"{lang}.tsv" + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": train_data_path, + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + rows = [] + with open(filepath, encoding='utf-8') as file: + for line in file: + rows.append(line.split("\t")) + + if self.config.schema == "source": + for key, row in enumerate(rows[1:]): + example = {"word": row[0], "valence": row[1], "arousal": row[2], "dominance": row[3], "joy": row[4], "anger": row[5], "sadness": row[6], "fear": row[7], "disgust": row[8]} + yield key, example diff --git a/seacrowd/sea_datasets/miracl/__init__.py b/seacrowd/sea_datasets/miracl/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/miracl/miracl.py b/seacrowd/sea_datasets/miracl/miracl.py new file mode 100644 index 000000000..0320cfd67 --- /dev/null +++ b/seacrowd/sea_datasets/miracl/miracl.py @@ -0,0 +1,288 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +MIRACL is a multilingual dataset for ad hoc retrieval across 18 languages that +collectively encompass over three billion native speakers around the world. +This resource is designed to support monolingual retrieval tasks, where the +queries and the corpora are in the same language. In total, we have gathered +over 726k high-quality relevance judgments for 78k queries over Wikipedia in +these languages, where all annotations have been performed by native speakers. +MIRACL covers Indonesian and Thai languages +""" + +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses + +from collections import defaultdict + +_CITATION = """\ +@article{10.1162/tacl_a_00595, + author = {Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy}, + title = "{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}", + journal = {Transactions of the Association for Computational Linguistics}, + volume = {11}, + pages = {1114-1131}, + year = {2023}, + month = {09}, + abstract = "{MIRACL is a multilingual dataset for ad hoc retrieval across 18 languages that collectively encompass over three billion native speakers around the world. This resource is designed to support monolingual retrieval tasks, where the queries and the corpora are in the same language. 
In total, we have gathered over 726k high-quality relevance judgments for 78k queries over Wikipedia in these languages, where all annotations have been performed by native speakers hired by our team. MIRACL covers languages that are both typologically close as well as distant from 10 language families and 13 sub-families, associated with varying amounts of publicly available resources. Extensive automatic heuristic verification and manual assessments were performed during the annotation process to control data quality. In total, MIRACL represents an investment of around five person-years of human annotator effort. Our goal is to spur research on improving retrieval across a continuum of languages, thus enhancing information access capabilities for diverse populations around the world, particularly those that have traditionally been underserved. MIRACL is available at http://miracl.ai/.}", + issn = {2307-387X}, + doi = {10.1162/tacl_a_00595}, + url = {https://doi.org/10.1162/tacl\_a\_00595}, + eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00595/2157340/tacl\_a\_00595.pdf}, +} +""" + + +_DATASETNAME = "miracl" + +_DESCRIPTION = """\ +MIRACL is a multilingual dataset for ad hoc retrieval across 18 languages that collectively encompass over three billion native speakers around the world. This resource is designed to support monolingual retrieval tasks, where the queries and the corpora are in the same language. In total, we have gathered over 726k high-quality relevance judgments for 78k queries over Wikipedia in these languages, where all annotations have been performed by native speakers. MIRACL covers Indonesian and Thai languages. Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/miracl/miracl and use huggingface-cli login for authentication. 
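+At generation time this dataloader also loads the companion miracl/miracl-corpus dataset from HuggingFace to resolve the passage texts referenced by the relevance judgments.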
+""" + +_HOMEPAGE = "https://project-miracl.github.io/" + +_LANGUAGES = ["ind", "tha"] + +_LICENSE = Licenses.APACHE_2_0.value + +_LANGUAGE_MAP = { + "id": "Thai", + "th": "Indonesian" +} + +_URLS = {_DATASETNAME: {lang: {} for lang in _LANGUAGE_MAP}} + +for lang in _LANGUAGE_MAP: + _URLS[_DATASETNAME][lang]['train'] = [ + f'https://huggingface.co/datasets/miracl/miracl/resolve/main/miracl-v1.0-{lang}/topics/topics.miracl-v1.0-{lang}-train.tsv', + f'https://huggingface.co/datasets/miracl/miracl/resolve/main/miracl-v1.0-{lang}/qrels/qrels.miracl-v1.0-{lang}-train.tsv', + ] + + _URLS[_DATASETNAME][lang]['dev'] = [ + f'https://huggingface.co/datasets/miracl/miracl/resolve/main/miracl-v1.0-{lang}/topics/topics.miracl-v1.0-{lang}-dev.tsv', + f'https://huggingface.co/datasets/miracl/miracl/resolve/main/miracl-v1.0-{lang}/qrels/qrels.miracl-v1.0-{lang}-dev.tsv', + ] + + _URLS[_DATASETNAME][lang]['testB'] =[ + f'https://huggingface.co/datasets/miracl/miracl/resolve/main/miracl-v1.0-{lang}/topics/topics.miracl-v1.0-{lang}-test-b.tsv', + ] + + _URLS[_DATASETNAME][lang]['testA'] = [ + f'https://huggingface.co/datasets/miracl/miracl/resolve/main/miracl-v1.0-{lang}/topics/topics.miracl-v1.0-{lang}-test-a.tsv', + ] + + +_SUPPORTED_TASKS = [Tasks.TEXT_RETRIEVAL] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +def load_topic(fn): + qid2topic = {} + with open(fn, encoding="utf-8") as f: + for line in f: + qid, topic = line.strip().split('\t') + qid2topic[qid] = topic + return qid2topic + + +def load_qrels(fn): + if fn is None: + return None + + qrels = defaultdict(dict) + with open(fn, encoding="utf-8") as f: + for line in f: + qid, _, docid, rel = line.strip().split('\t') + qrels[qid][docid] = int(rel) + return qrels + +def seacrowd_config_constructor(lang, schema, version): + if lang not in _LANGUAGE_MAP: + raise ValueError(f"Invalid lang {lang}") + + if schema != "source" and schema != "seacrowd_pairs": + raise ValueError(f"Invalid schema: {schema}") + + return SEACrowdConfig( + name="miracl_{lang}_{schema}".format(lang=lang, schema=schema), + version=datasets.Version(version), + description="MIRACL {schema} schema for {lang} language".format(lang=_LANGUAGE_MAP[lang], schema=schema), + schema=schema, + subset_id="miracl_{lang}".format(lang=lang), + ) + +class Miracl(datasets.GeneratorBasedBuilder): + """MIRACL is a multilingual retrieval dataset that focuses on search across 18 different languages.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [seacrowd_config_constructor(lang, "source", _SOURCE_VERSION) for lang in _LANGUAGE_MAP] + [seacrowd_config_constructor(lang, "seacrowd_pairs", _SEACROWD_VERSION) for lang in _LANGUAGE_MAP] + + DEFAULT_CONFIG_NAME = None + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features({ + 'query_id': datasets.Value('string'), + 'query': datasets.Value('string'), + + 'positive_passages': [{ + 'docid': datasets.Value('string'), + 'text': datasets.Value('string'), 'title': datasets.Value('string') + }], + 'negative_passages': [{ + 'docid': datasets.Value('string'), + 'text': datasets.Value('string'), 'title': datasets.Value('string'), + }], + }) + elif self.config.schema == "seacrowd_pairs": + features = schemas.pairs_features(["pos", "neg", "none"]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + 
features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + lang = self.config.name.split("_")[1] + downloaded_files = dl_manager.download_and_extract(urls[lang]) + + return [ + datasets.SplitGenerator( + name="train", + gen_kwargs={ + "filepaths": downloaded_files["train"], + "split": "train", + }, + ), + datasets.SplitGenerator( + name="dev", + gen_kwargs={ + "filepaths": downloaded_files["dev"], + "split": "dev", + }, + ), + datasets.SplitGenerator( + name="testA", + gen_kwargs={ + "filepaths": downloaded_files["testA"], + "split": "testA", + }, + ), + datasets.SplitGenerator( + name="testB", + gen_kwargs={ + "filepaths": downloaded_files["testB"], + "split": "testB", + }, + ) + ] + + + def _generate_examples(self, filepaths: List[str], split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + lang = self.config.name.split("_")[1] + + # the following code except for seacrowd_pairs is taken from the original MIRACL + # dataloader implementation + # https://huggingface.co/datasets/miracl/miracl + miracl_corpus = datasets.load_dataset('miracl/miracl-corpus', lang)['train'] + docid2doc = {doc['docid']: (doc['title'], doc['text']) for doc in miracl_corpus} + topic_fn, qrel_fn = (filepaths) if len(filepaths) == 2 else (filepaths[0], None) + qid2topic = load_topic(topic_fn) + qrels = load_qrels(qrel_fn) + + if self.config.schema == "source": + for qid in qid2topic: + data = {} + data['query_id'] = qid + data['query'] = qid2topic[qid] + + pos_docids = [docid for docid, rel in qrels[qid].items() if rel == 1] if qrels is not None else [] + neg_docids = [docid for docid, rel in qrels[qid].items() if rel == 0] if qrels is not None else [] + + data['positive_passages'] = [{ + 'docid': docid, + **dict(zip(['title', 'text'], docid2doc[docid])) + } for docid in pos_docids if docid in docid2doc] + + data['negative_passages'] = [{ + 'docid': docid, + **dict(zip(['title', 'text'], docid2doc[docid])) + } for docid in neg_docids if docid in docid2doc] + + yield qid, data + + elif self.config.schema == "seacrowd_pairs": + id = -1 + for qid in qid2topic: + pos_docids = [docid for docid, rel in qrels[qid].items() if rel == 1] if qrels is not None else [] + neg_docids = [docid for docid, rel in qrels[qid].items() if rel == 0] if qrels is not None else [] + + positive_passages = [{ + 'docid': docid, + **dict(zip(['title', 'text'], docid2doc[docid])) + } for docid in pos_docids if docid in docid2doc] + + negative_passages = [{ + 'docid': docid, + **dict(zip(['title', 'text'], docid2doc[docid])) + } for docid in neg_docids if docid in docid2doc] + + # assemble data + data = {} + data['text_1'] = qid2topic[qid] # query + + if split in ["testA", "testB"]: # test sets only contains id and query + id += 1 + data['id'] = id + data['text_2'] = "" + data['label'] = "none" + + yield id, data + else: + # generate positive pairs + for positive_doc in positive_passages: + id += 1 + data['id'] = id + # flatten dict contents to String by concatenating title and text separated by double newline + data['text_2'] = positive_doc['title'] + "\n\n" + positive_doc["text"] + data['label'] = "pos" + yield id, data + + # generate negative pairs + for negative_doc in negative_passages: + id += 1 + data['id'] = id + # flatten dict contents to String by concatenating title and text separated by double newline + 
data['text_2'] = negative_doc['title'] + "\n\n" + negative_doc["text"] + data['label'] = "neg" + yield id, data diff --git a/seacrowd/sea_datasets/mkqa/__init__.py b/seacrowd/sea_datasets/mkqa/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/mkqa/mkqa.py b/seacrowd/sea_datasets/mkqa/mkqa.py new file mode 100644 index 000000000..71dc24781 --- /dev/null +++ b/seacrowd/sea_datasets/mkqa/mkqa.py @@ -0,0 +1,227 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{longpre-etal-2021-mkqa, + title = "{MKQA}: A Linguistically Diverse Benchmark for Multilingual Open Domain Question Answering", + author = "Longpre, Shayne and + Lu, Yi and + Daiber, Joachim", + editor = "Roark, Brian and + Nenkova, Ani", + journal = "Transactions of the Association for Computational Linguistics", + volume = "9", + year = "2021", + address = "Cambridge, MA", + publisher = "MIT Press", + url = "https://aclanthology.org/2021.tacl-1.82", + doi = "10.1162/tacl_a_00433", + pages = "1389--1406", +} +""" + +_DATASETNAME = "mkqa" + +_DESCRIPTION = """\ +Multilingual Knowledge Questions and Answers (MKQA), an open-domain question answering evaluation set comprising 10k question-answer pairs aligned across 26 typologically diverse languages (260k question-answer pairs in total) +""" + +_HOMEPAGE = "https://github.com/apple/ml-mkqa" + +_LICENSE = Licenses.CC_BY_SA_3_0.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://github.com/apple/ml-mkqa/raw/main/dataset/mkqa.jsonl.gz", +} + +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + +_LANGUAGES = [ + "khm", + "zsm", + "tha", + "vie", +] # follows the convention of 3-letter code as suggested since NusaCrowd. + + +class MKQADataset(datasets.GeneratorBasedBuilder): + """ + MKQA, an open-domain question answering evaluation set comprising 10k question-answer pairs + aligned across 26 typologically diverse languages (260k question-answer pairs in total). + The goal of this dataset is to provide a challenging benchmark for question answering quality + across a wide set of languages. 
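+    Only the Khmer (khm), Malay (zsm), Thai (tha) and Vietnamese (vie) subsets are exposed here;
+    answers and queries for the other MKQA languages are filtered out when examples are generated.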
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + _ANS_TYPES = [ + "binary", + "date", + "entity", + "long_answer", + "number", + "number_with_unit", + "short_phrase", + "unanswerable", + ] + + _SOURCE_LANGUAGES = [ + "km", + "ms", + "th", + "vi", + # Filtered out: + # "ar", "da", "de", "en", "es", "fi", "fr", "he", "hu", "it", "ja", "ko", + # "nl", "no", "pl", "pt", "ru", "sv", "tr", "zh_cn", "zh_hk", "zh_tw", + ] + + _LANG_3TO2 = { + "khm": "km", + "zsm": "ms", + "tha": "th", + "vie": "vi", + } + + BUILDER_CONFIGS = [ + *[ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset_lang}{'_' if subset_lang else ''}source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{subset_lang}", + ) + for subset_lang in ["", *_LANGUAGES] + ], + *[ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset_lang}{'_' if subset_lang else ''}seacrowd_qa", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_qa", + subset_id=f"{_DATASETNAME}_{subset_lang}", + ) + for subset_lang in ["", *_LANGUAGES] + ], + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + lang = self.config.subset_id.rsplit("_", 1)[-1] + lang = self._LANG_3TO2.get(lang, lang) + + if self.config.schema == "source": + features = datasets.Features( + { + "query": datasets.Value("string"), + "answers": { + cur_lang: [ + { + "type": datasets.ClassLabel(names=self._ANS_TYPES), + "entity": datasets.Value("string"), + "text": datasets.Value("string"), + "aliases": [datasets.Value("string")], + } + ] + for cur_lang in ([lang] if lang else self._SOURCE_LANGUAGES) + }, + "queries": {cur_lang: datasets.Value("string") for cur_lang in ([lang] if lang else self._SOURCE_LANGUAGES)}, + "example_id": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_qa": + features = schemas.qa_features + features["meta"]["answer_entity"] = datasets.Sequence(datasets.Value("string")) + features["meta"]["answer_aliases"] = datasets.Sequence(datasets.Sequence(datasets.Value("string"))) + features["meta"]["answer_type"] = datasets.Sequence(datasets.ClassLabel(names=self._ANS_TYPES)) + + else: # schema not found! should NOT reach here ... 
+ raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_path = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": data_path}, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + lang = self.config.subset_id.rsplit("_", 1)[-1] + lang = self._LANG_3TO2.get(lang, lang) + + datas = [] + with open(filepath, "r", encoding="utf8") as ipt: + for cur in map(json.loads, ipt): + cur["example_id"] = str(cur["example_id"]) + for key in ["answers", "queries"]: + cur[key] = {k: v for k, v in cur[key].items() if k in ([lang] if lang else self._SOURCE_LANGUAGES)} + datas.append(cur) + + if self.config.schema == "source": + for cur in datas: + for anslist in cur["answers"].values(): + for ans in anslist: + ans.setdefault("entity", "") + ans.setdefault("aliases", []) + yield int(cur["example_id"]), cur + + elif self.config.schema == "seacrowd_qa": + for cur in datas: + for cur_lang in [lang] if lang else map(lambda k: self._LANG_3TO2.get(k, k), _LANGUAGES): + ret = { + "id": f'{cur["example_id"]}_{cur_lang}', + "question_id": cur["example_id"], + "document_id": "", + "question": cur["queries"][cur_lang], + "type": "open_domain", + "choices": [], + "context": "", + "answer": [ans.get("text", None) for ans in cur["answers"][cur_lang]], + "meta": {f"answer_{k}": [ans.get(k, None) for ans in cur["answers"][cur_lang]] for k in ["entity", "aliases", "type"]}, + } + ret["meta"]["answer_aliases"] = list(map(lambda a: [] if a is None else a, ret["meta"]["answer_aliases"])) + yield ret["id"], ret diff --git a/seacrowd/sea_datasets/mlqa/mlqa.py b/seacrowd/sea_datasets/mlqa/mlqa.py index a9b07717e..f2884e0f1 100644 --- a/seacrowd/sea_datasets/mlqa/mlqa.py +++ b/seacrowd/sea_datasets/mlqa/mlqa.py @@ -29,6 +29,7 @@ _HOMEPAGE = "https://github.com/facebookresearch/MLQA" _LICENSE = Licenses.CC_BY_SA_3_0.value +_LANGUAGES = ["vie"] _URL = "https://dl.fbaipublicfiles.com/MLQA/" _DEV_TEST_URL = "MLQA_V1.zip" _TRANSLATE_TEST_URL = "mlqa-translate-test.tar.gz" diff --git a/seacrowd/sea_datasets/mozilla_pontoon/__init__.py b/seacrowd/sea_datasets/mozilla_pontoon/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/mozilla_pontoon/mozilla_pontoon.py b/seacrowd/sea_datasets/mozilla_pontoon/mozilla_pontoon.py new file mode 100644 index 000000000..8ddde1421 --- /dev/null +++ b/seacrowd/sea_datasets/mozilla_pontoon/mozilla_pontoon.py @@ -0,0 +1,171 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
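+
+# Note: this loader does not fetch files through the dl_manager; it pulls the
+# "ayymen/Pontoon-Translations" dataset from the HuggingFace Hub via datasets.load_dataset,
+# one "en-<lang>" config per requested language (see _load_hf_data_from_remote below).
+#
+# Minimal usage sketch -- the script path below is an assumption based on this repository layout:
+#
+#   import datasets
+#   ds = datasets.load_dataset(
+#       "seacrowd/sea_datasets/mozilla_pontoon/mozilla_pontoon.py",
+#       name="mozilla_pontoon_eng_vie_seacrowd_t2t",
+#   )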
+from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +# Keep blank; dataset has no associated paper +_CITATION = """\ +@article{, + author = {}, + title = {}, + journal = {}, + volume = {}, + year = {}, + url = {}, + doi = {}, + biburl = {}, + bibsource = {} +} +""" + +_LOCAL = False +_LANGUAGES = ["mya", "ceb", "gor", "hil", "ilo", "ind", "jav", "khm", "lao", "zlm", "nia", "tgl", "tha", "vie"] + +_DATASETNAME = "mozilla_pontoon" +_DESCRIPTION = """ +This dataset contains crowdsource translations of more than 200 languages for +different Mozilla open-source projects from Mozilla's Pontoon localization platform. +Source sentences are in English. +""" + +_HOMEPAGE = "https://huggingface.co/datasets/ayymen/Pontoon-Translations" +_LICENSE = Licenses.BSD_3_CLAUSE.value +_URL = "https://huggingface.co/datasets/ayymen/Pontoon-Translations" + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class MozillaPontoonDataset(datasets.GeneratorBasedBuilder): + """Dataset of translations from Mozilla's Pontoon platform.""" + + # Two-letter ISO code is used when available + # otherwise 3-letter one is used + LANG_CODE_MAPPER = {"mya": "my", "ceb": "ceb", "gor": "gor", "hil": "hil", "ilo": "ilo", "ind": "id", "jav": "jv", "khm": "km", "lao": "lo", "zlm": "ms", "nia": "nia", "tgl": "tl", "tha": "th", "vie": "vi"} + + # Config to load individual datasets per language + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_eng_{lang}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema for {lang} language", + schema="source", + subset_id=f"{_DATASETNAME}_eng_{lang}", + ) + for lang in _LANGUAGES + ] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_eng_{lang}_seacrowd_t2t", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema for {lang} language", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}_eng_{lang}", + ) + for lang in _LANGUAGES + ] + + # Config to load all datasets + BUILDER_CONFIGS.extend( + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema for all languages", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_t2t", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema for all languages", + schema="seacrowd_t2t", + subset_id=_DATASETNAME, + ), + ] + ) + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "source_string": datasets.Value("string"), + "target_string": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # dl_manager not used since dataloader uses HF 'load_dataset' + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"split": "train"}, + ), + ] + + def _load_hf_data_from_remote(self, language: str) -> 
datasets.DatasetDict: + """Load dataset from HuggingFace.""" + hf_lang_code = self.LANG_CODE_MAPPER[language] + hf_remote_ref = "/".join(_URL.split("/")[-2:]) + return datasets.load_dataset(hf_remote_ref, f"en-{hf_lang_code}", split="train") + + def _generate_examples(self, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + languages = [] + pontoon_datasets = [] + + lang = self.config.subset_id.split("_")[-1] + if lang in _LANGUAGES: + languages.append(lang) + pontoon_datasets.append(self._load_hf_data_from_remote(lang)) + else: + for lang in _LANGUAGES: + languages.append(lang) + pontoon_datasets.append(self._load_hf_data_from_remote(lang)) + + index = 0 + for lang, lang_subset in zip(languages, pontoon_datasets): + for row in lang_subset: + if self.config.schema == "source": + example = row + + elif self.config.schema == "seacrowd_t2t": + example = { + "id": str(index), + "text_1": row["source_string"], + "text_2": row["target_string"], + "text_1_name": "eng", + "text_2_name": lang, + } + yield index, example + index += 1 diff --git a/seacrowd/sea_datasets/mswc/__init__.py b/seacrowd/sea_datasets/mswc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/mswc/mswc.py b/seacrowd/sea_datasets/mswc/mswc.py new file mode 100644 index 000000000..36c34cf94 --- /dev/null +++ b/seacrowd/sea_datasets/mswc/mswc.py @@ -0,0 +1,219 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +_CITATION = """\ +@inproceedings{mazumder2021mswc, + author = {Mazumder, Mark and Chitlangia, Sharad and Banbury, Colby and Kang, Yiping and Ciro, Juan and Achorn, Keith and Galvez, + Daniel and Sabini, Mark and Mattson, Peter and Kanter, David and Diamos, Greg and Warden, Pete and Meyer, Josh and Janapa Reddi, + Vijay}, + booktitle = {Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks}, + editor = {J. Vanschoren and S. Yeung}, + pages = {}, + publisher = {Curran}, + title = {Multilingual Spoken Words Corpus}, + url = {https://datasets-benchmarks-proceedings.neurips.cc/paper_files/paper/2021/file/fe131d7f5a6b38b23cc967316c13dae2-Paper-round2.pdf}, + volume = {1}, + year = {2021} +} +""" + +_DATASETNAME = "mswc" + +_DESCRIPTION = """\ +Multilingual Spoken Words Corpus is a large and growing audio dataset of spoken words in 50 languages collectively spoken by over 5 billion people, for academic research and commercial applications in keyword spotting and spoken term search. 
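+This dataloader covers the cnh, ind and vie subsets, in either wav or opus audio format, and reads the per-split parquet shards published on the HuggingFace Hub.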
+""" + +_HOMEPAGE = "https://huggingface.co/datasets/MLCommons/ml_spoken_words" + +_LANGUAGES = ["cnh", "ind", "vie"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) +_LANGUAGE_NAME_MAP = { + "cnh": "cnh", + "ind": "id", + "vie": "vi", +} + +_FORMATS = ["wav", "opus"] + +_LICENSE = Licenses.CC_BY_4_0.value + +_LOCAL = False + +_URLS = "https://huggingface.co/datasets/MLCommons/ml_spoken_words/resolve/refs%2Fconvert%2Fparquet/{lang}_{format}/{split}/0000.parquet?download=true" + +_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION] +_SUPPORTED_SCHEMA_STRINGS = [f"seacrowd_{str(TASK_TO_SCHEMA[task]).lower()}" for task in _SUPPORTED_TASKS] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class MSWC(datasets.GeneratorBasedBuilder): + """ + Multilingual Spoken Words Corpus is a large and growing audio dataset of spoken words in 50 languages collectively spoken by over 5 billion people, for academic research and commercial applications in keyword spotting and spoken term search. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [] + + for language in _LANGUAGES: + for format in _FORMATS: + subset_id = f"{_DATASETNAME}_{language}_{format}" + BUILDER_CONFIGS.append( + SEACrowdConfig(name=f"{subset_id}_source", version=SOURCE_VERSION, description=f"{_DATASETNAME} source schema", schema="source", subset_id=subset_id), + ) + + seacrowd_schema_config: list[SEACrowdConfig] = [] + + for seacrowd_schema in _SUPPORTED_SCHEMA_STRINGS: + for language in _LANGUAGES: + for format in _FORMATS: + subset_id = f"{_DATASETNAME}_{language}_{format}" + seacrowd_schema_config.append( + SEACrowdConfig( + name=f"{subset_id}_{seacrowd_schema}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} {seacrowd_schema} schema", + schema=f"{seacrowd_schema}", + subset_id=subset_id, + ) + ) + + BUILDER_CONFIGS.extend(seacrowd_schema_config) + + DEFAULT_CONFIG_NAME = f"{_LANGUAGES[0]}_{_FORMATS[0]}_source" + + def _info(self) -> datasets.DatasetInfo: + + _, _, format = str(self.config.subset_id).split("_") + + if self.config.schema == "source": + features = datasets.Features( + { + "file": datasets.Value("string"), + "is_valid": datasets.Value("bool"), + "language": datasets.ClassLabel(num_classes=3), + "speaker_id": datasets.Value("string"), + "gender": datasets.ClassLabel(num_classes=4), + "keyword": datasets.Value("string"), + "audio": datasets.Audio(decode=False, sampling_rate=16000 if format == "wav" else 48000), + } + ) + + elif self.config.schema == f"seacrowd_{str(TASK_TO_SCHEMA[Tasks.SPEECH_RECOGNITION]).lower()}": + features = schemas.speech_text_features + + else: + raise ValueError(f"Invalid config: {self.config.name}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + split_names = ["train", "validation", "test"] + + result = [] + + _, language, format = str(self.config.subset_id).split("_") + + for split_name in split_names: + path = dl_manager.download_and_extract(_URLS.format(split=split_name, lang=_LANGUAGE_NAME_MAP[language], format=format)) + + result.append( + datasets.SplitGenerator( + name=split_name, + gen_kwargs={ + "path": path, + "split": split_name, + "language": language, + "format": format, + }, + ), + ) + + return result + + 
def _generate_examples(self, path: Path, split: str, language: str, format: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + idx = 0 + + if self.config.schema == "source": + df = pd.read_parquet(path) + + for _, row in df.iterrows(): + yield idx, row.to_dict() + idx += 1 + + elif self.config.schema == f"seacrowd_{str(TASK_TO_SCHEMA[Tasks.SPEECH_RECOGNITION]).lower()}": + df = pd.read_parquet(path) + + base_folder = os.path.dirname(path) + base_folder = os.path.join(base_folder, _DATASETNAME, language, format, split) + + if not os.path.exists(base_folder): + os.makedirs(base_folder) + + audio_paths = [] + + for _, row in df.iterrows(): + audio_dict = row["audio"] + file_name = audio_dict["path"] + + path = os.path.join(base_folder, file_name) + + audio_dict["path"] = path + + with open(path, "wb") as f: + f.write(audio_dict["bytes"]) + + audio_paths.append(path) + + df.rename(columns={"label": "text"}, inplace=True) + + df["path"] = audio_paths + + df["id"] = df.index + idx + df = df.assign(text="").astype({"text": "str"}) + df = df.assign(metadata=[{"speaker_age": 0, "speaker_gender": gender} for gender in df["gender"]]).astype({"metadata": "object"}) + + df.drop(columns=["file", "is_valid", "language", "gender", "keyword"], inplace=True) + + for _, row in df.iterrows(): + yield idx, row.to_dict() + idx += 1 + + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/mtop_intent_classification/__init__.py b/seacrowd/sea_datasets/mtop_intent_classification/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/mtop_intent_classification/labels.py b/seacrowd/sea_datasets/mtop_intent_classification/labels.py new file mode 100644 index 000000000..9d7f6d76b --- /dev/null +++ b/seacrowd/sea_datasets/mtop_intent_classification/labels.py @@ -0,0 +1,126 @@ +DOMAIN_LABELS = [ + "messaging", + "calling", + "event", + "timer", + "music", + "weather", + "alarm", + "people", + "reminder", + "recipes", + "news", +] + +INTENT_LABELS = [ + "SEND_MESSAGE", + "GET_MESSAGE", + "GET_MESSAGE_CONTACT", + "CREATE_CALL", + "GET_CONTACT", + "GET_AVAILABILITY", + "SET_RSVP_INTERESTED", + "SET_RSVP_YES", + "SET_DEFAULT_PROVIDER_CALLING", + "PAUSE_TIMER", + "GET_SUNSET", + "GET_TIMER", + "SUBTRACT_TIME_TIMER", + "RESUME_TIMER", + "CREATE_ALARM", + "CREATE_TIMER", + "PLAY_MUSIC", + "SET_DEFAULT_PROVIDER_MUSIC", + "GET_STORIES_NEWS", + "UPDATE_TIMER", + "SHARE_EVENT", + "PLAY_MEDIA", + "UPDATE_CALL", + "STOP_SHUFFLE_MUSIC", + "GET_ALARM", + "GET_WEATHER", + "PREFER", + "GET_LOCATION", + "GET_DATE_TIME_EVENT", + "GET_EVENT", + "GET_CATEGORY_EVENT", + "UPDATE_METHOD_CALL", + "END_CALL", + "SET_UNAVAILABLE", + "GET_CALL_TIME", + "GET_CALL", + "SNOOZE_ALARM", + "SILENCE_ALARM", + "DELETE_ALARM", + "UNLOOP_MUSIC", + "UPDATE_ALARM", + "CREATE_PLAYLIST_MUSIC", + "DELETE_REMINDER", + "CREATE_REMINDER", + "ANSWER_CALL", + "SET_AVAILABLE", + "SWITCH_CALL", + "UPDATE_REMINDER_DATE_TIME", + "GET_REMINDER", + "GET_REMINDER_DATE_TIME", + "UPDATE_REMINDER_TODO", + "IGNORE_CALL", + "GET_INFO_CONTACT", + "GET_EMPLOYER", + "GET_EMPLOYMENT_TIME", + "GET_EDUCATION_TIME", + "GET_UNDERGRAD", + "GET_AGE", + "GET_EDUCATION_DEGREE", + "GET_INFO_RECIPES", + "GET_RECIPES", + "IS_TRUE_RECIPES", + "PREVIOUS_TRACK_MUSIC", + "QUESTION_NEWS", + "GET_DETAILS_NEWS", + "ADD_TIME_TIMER", + "LIKE_MUSIC", + "ADD_TO_PLAYLIST_MUSIC", + "QUESTION_MUSIC", + "SKIP_TRACK_MUSIC", + "LOOP_MUSIC", + "PAUSE_MUSIC", + "GET_TRACK_INFO_MUSIC", + 
"DISLIKE_MUSIC", + "RESTART_TIMER", + "DELETE_TIMER", + "DELETE_PLAYLIST_MUSIC", + "REMOVE_FROM_PLAYLIST_MUSIC", + "REWIND_MUSIC", + "REPLAY_MUSIC", + "START_SHUFFLE_MUSIC", + "STOP_MUSIC", + "FAST_FORWARD_MUSIC", + "FOLLOW_MUSIC", + "SET_RSVP_NO", + "GET_ATTENDEE_EVENT", + "GET_SUNRISE", + "HOLD_CALL", + "GET_CALL_CONTACT", + "GET_REMINDER_AMOUNT", + "HELP_REMINDER", + "GET_REMINDER_LOCATION", + "UPDATE_REMINDER", + "UPDATE_REMINDER_LOCATION", + "REPEAT_ALL_OFF_MUSIC", + "GET_CONTACT_METHOD", + "GET_JOB", + "GET_LIFE_EVENT", + "GET_GENDER", + "GET_MAJOR", + "GET_MUTUAL_FRIENDS", + "GET_LYRICS_MUSIC", + "CANCEL_MESSAGE", + "RESUME_CALL", + "REPEAT_ALL_MUSIC", + "RESUME_MUSIC", + "GET_LIFE_EVENT_TIME", + "GET_LANGUAGE", + "MERGE_CALL", + "GET_AIRQUALITY", +] \ No newline at end of file diff --git a/seacrowd/sea_datasets/mtop_intent_classification/mtop_intent_classification.py b/seacrowd/sea_datasets/mtop_intent_classification/mtop_intent_classification.py new file mode 100644 index 000000000..a8ff6ef4f --- /dev/null +++ b/seacrowd/sea_datasets/mtop_intent_classification/mtop_intent_classification.py @@ -0,0 +1,135 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.sea_datasets.mtop_intent_classification.labels import ( + DOMAIN_LABELS, INTENT_LABELS) +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{li-etal-2021-mtop, + author = {Li, Haoran and Arora, Abhinav and Chen, Shuochi and Gupta, Anchit and Gupta, Sonal and Mehdad, Yashar}, + title = {MTOP: A Comprehensive Multilingual Task-Oriented Semantic Parsing Benchmark}, + booktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume}, + publisher = {Association for Computational Linguistics}, + year = {2021}, + url = {https://aclanthology.org/2021.eacl-main.257}, + doi = {10.18653/v1/2021.eacl-main.257}, + pages = {2950-2962}, +} +""" +_LOCAL = False +_LANGUAGES = ["tha"] +_DATASETNAME = "mtop_intent_classification" +_DESCRIPTION = """ +This dataset contains annotated utterances from 6 languages, including Thai, +for semantic parsing. Queries corresponding to the chosen domains are crowdsourced. + Two subsets are included in this dataset: 'domain' (eg. 'news', 'people', 'weather') + and 'intent' (eg. 
'GET_MESSAGE', 'STOP_MUSIC', 'END_CALL') +""" + +_HOMEPAGE = "https://huggingface.co/mteb" +_LICENSE = Licenses.CC_BY_SA_4_0.value # Found in original dataset (not HF) linked in paper +_URL = "https://huggingface.co/datasets/mteb/" + + +_SUPPORTED_TASKS = [Tasks.INTENT_CLASSIFICATION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class MTOPIntentClassificationDataset(datasets.GeneratorBasedBuilder): + """Dataset of Thai sentences and their domains or intents.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + SUBSETS = ["domain", "intent"] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema for {subset} subset", + schema="source", + subset_id=subset, + ) + for subset in SUBSETS + ] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_seacrowd_text", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema for {subset} subset", + schema="seacrowd_text", + subset_id=subset, + ) + for subset in SUBSETS + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_domain_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("int64"), + "text": datasets.Value("string"), + "label": datasets.Value("int32"), + "label_text": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_text": + if self.config.subset_id == "domain": + labels = DOMAIN_LABELS + elif self.config.subset_id == "intent": + labels = INTENT_LABELS + else: + raise ValueError(f"Received unexpected schema name {self.config.name}") + features = schemas.text_features(label_names=labels) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + # dl_manager not used since dataloader uses HF `load_dataset` + return [datasets.SplitGenerator(name=split, gen_kwargs={"split": split._name}) for split in (datasets.Split.TRAIN, datasets.Split.VALIDATION, datasets.Split.TEST)] + + def _load_hf_data_from_remote(self, split: str) -> datasets.DatasetDict: + """Load dataset from HuggingFace.""" + if self.config.subset_id not in ("domain", "intent"): + raise ValueError(f"Received unexpected schema name {self.config.name}") + HF_REMOTE_REF = "/".join(_URL.split("/")[-2:]) + f"mtop_{self.config.subset_id}" + _hf_dataset_source = datasets.load_dataset(HF_REMOTE_REF, "th", split=split) + return _hf_dataset_source + + def _generate_examples(self, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + data = self._load_hf_data_from_remote(split=split) + for index, row in enumerate(data): + if self.config.schema == "source": + example = row + + elif self.config.schema == "seacrowd_text": + example = {"id": str(index), "text": row["text"], "label": row["label_text"]} + yield index, example diff --git a/seacrowd/sea_datasets/muse/__init__.py b/seacrowd/sea_datasets/muse/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/muse/muse.py b/seacrowd/sea_datasets/muse/muse.py new file mode 100644 index 000000000..fa91d3258 --- /dev/null +++ b/seacrowd/sea_datasets/muse/muse.py @@ -0,0 +1,197 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and 
the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Contains 110 large-scale ground-truth bilingual dictionaries created and released by Meta using an internal translation tool.
+The dictionaries account for polysemy. The data comprises a train and a test split of 5000 and 1500 unique source words, respectively, as well as a larger set of up to 100k pairs.
+It covers European languages in every direction, and SEA languages to and from English.
+"""
+
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """\
+@inproceedings{lample2018word,
+  title={Word translation without parallel data},
+  author={Lample, Guillaume and Conneau, Alexis and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}},
+  booktitle={International Conference on Learning Representations},
+  year={2018}
+}
+"""
+
+_DATASETNAME = "muse"
+
+_DESCRIPTION = """\
+Contains 110 large-scale ground-truth bilingual dictionaries created and released by Meta using an internal translation tool.
+The dictionaries account for polysemy. The data comprises a train and a test split of 5000 and 1500 unique source words, respectively, as well as a larger set of up to 100k pairs.
+It covers European languages in every direction, and SEA languages to and from English.
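+For each language pair, this dataloader uses the 0-5000 dictionary file as the train split and the 5000-6500 file as the validation split.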
+""" + +_HOMEPAGE = "https://github.com/facebookresearch/MUSE#ground-truth-bilingual-dictionaries" + +_LANGUAGES = ["tgl", "ind", "zlm", "tha", "vie"] + +_LICENSE = Licenses.CC_BY_NC_ND_4_0.value + +_LOCAL = False + +_TRAIN_URL_TEMPLATE = "https://dl.fbaipublicfiles.com/arrival/dictionaries/{src}-{tgt}.0-5000.txt" +_TEST_URL_TEMPLATE = "https://dl.fbaipublicfiles.com/arrival/dictionaries/{src}-{tgt}.5000-6500.txt" + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + +configs = { + "tgl": ["eng"], + "ind": ["eng"], + "zlm": ["eng"], + "tha": ["eng"], + "vie": ["eng"], + "eng": ["tha", "vie", "tgl", "zlm", "ind"], +} + +langid_dict = { + "eng": "en", + "tgl": "tl", + "ind": "id", + "zlm": "ms", + "tha": "th", + "vie": "vi", +} + + +class MUSEDataset(datasets.GeneratorBasedBuilder): + """Large-scale ground-truth bilingual dictionaries""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = ( + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_tgl_eng", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_t2t", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}_tgl_eng", + ), + ] + + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{src_lang}_{tgt_lang}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{src_lang}_{tgt_lang}", + ) + for src_lang in configs + for tgt_lang in configs[src_lang] + ] + + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{src_lang}_{tgt_lang}_seacrowd_t2t", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}_{src_lang}_{tgt_lang}", + ) + for src_lang in configs + for tgt_lang in configs[src_lang] + ] + ) + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "src_text": datasets.Value("string"), + "tgt_text": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + + _, src_lang, tgt_lang = self.config.subset_id.split("_") + train_url = _TRAIN_URL_TEMPLATE.format(src=langid_dict[src_lang], tgt=langid_dict[tgt_lang]) + test_url = _TEST_URL_TEMPLATE.format(src=langid_dict[src_lang], tgt=langid_dict[tgt_lang]) + + train_file = dl_manager.download_and_extract(train_url) + test_file = dl_manager.download_and_extract(test_url) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "src_lang": src_lang, + "tgt_lang": tgt_lang, + "filepath": train_file, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "src_lang": src_lang, + "tgt_lang": tgt_lang, + "filepath": test_file, + }, + ), + ] + + def _generate_examples(self, src_lang: str, tgt_lang: str, filepath: Path) -> Tuple[int, Dict]: + if self.config.schema == "source": + for 
row_id, line in enumerate(open(filepath)): + src_text, tgt_text = line.strip().split("\t") + yield row_id, {"id": row_id, "src_text": src_text, "tgt_text": tgt_text} + + elif self.config.schema == "seacrowd_t2t": + for row_id, line in enumerate(open(filepath)): + src_text, tgt_text = line.strip().split("\t") + yield row_id, { + "id": row_id, + "text_1": src_text, + "text_2": tgt_text, + "text_1_name": src_lang, + "text_2_name": tgt_lang, + } diff --git a/seacrowd/sea_datasets/my_paraphrase/__init__.py b/seacrowd/sea_datasets/my_paraphrase/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/my_paraphrase/my_paraphrase.py b/seacrowd/sea_datasets/my_paraphrase/my_paraphrase.py new file mode 100644 index 000000000..486ece5a6 --- /dev/null +++ b/seacrowd/sea_datasets/my_paraphrase/my_paraphrase.py @@ -0,0 +1,200 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{htay2022deep, + title={Deep Siamese Neural Network Vs Random Forest for Myanmar Language Paraphrase Classification}, + author={Htay, Myint Myint and Thu, Ye Kyaw and Thant, Hnin Aye and Supnithi, Thepchai}, + journal={Journal of Intelligent Informatics and Smart Technology}, + year={2022} +} +""" + +_DATASETNAME = "my_paraphrase" + +_DESCRIPTION = """\ +The myParaphrase corpus is intended for the task of assessing whether pairs of Burmese sentences exhibit similar meanings \ +or are paraphrases. It encompasses 40461 pairs for training, along with 1000 pairs for an open test and an additional 1000 pairs \ +for a closed test. If a pair of sentences in Burmese is considered a paraphrase, it is labeled with "1"; if not, they receive a label of "0." +""" + +_HOMEPAGE = "https://github.com/ye-kyaw-thu/myParaphrase" + +_LANGUAGES = ["mya"] + +_LICENSE = Licenses.CC_BY_NC_SA_4_0.value +_LOCAL = False + +_URLS = { + _DATASETNAME: [ + "https://github.com/ye-kyaw-thu/myParaphrase/raw/main/corpus/ver1.0/csv-qqp/train.csv", + "https://github.com/ye-kyaw-thu/myParaphrase/raw/main/corpus/ver1.0/csv-qqp/open-test.final.manual.csv", + "https://github.com/ye-kyaw-thu/myParaphrase/raw/main/corpus/ver1.0/csv-qqp/closed-test.csv", + ], +} + +_SUPPORTED_TASKS = [Tasks.PARAPHRASING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" +_TAGS = [0, 1] + + +class MyParaphraseDataset(datasets.GeneratorBasedBuilder): + """The "myParaphrase" corpus is a Burmese dataset used for paraphrase identification. \ + It includes 40,461 training pairs and 2,000 test pairs. 
Pairs are labeled "1" for paraphrases and "0" otherwise.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + SEACROWD_SCHEMA_NAME = "t2t" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", # source + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="paraphrase_source", + subset_id=f"{_DATASETNAME}_paraphrase", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", # schema + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_paraphrase_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}_paraphrase", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_non_paraphrase_source", # source + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="non_paraphrase_source", + subset_id=f"{_DATASETNAME}_non_paraphrase", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_non_paraphrase_seacrowd_{SEACROWD_SCHEMA_NAME}", # schema + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_non_paraphrase_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}_non_paraphrase", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_all_source", # source + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="all_source", + subset_id=f"{_DATASETNAME}_all", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_all_seacrowd_{SEACROWD_SCHEMA_NAME}", # schema + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_all_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}_all", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema.endswith("_source"): + features = datasets.Features({"id": datasets.Value("int32"), "paraphrase1": datasets.Value("string"), "paraphrase2": datasets.Value("string"), "is_paraphrase": datasets.Value("int32")}) + + elif self.config.schema.endswith(self.SEACROWD_SCHEMA_NAME): + features = schemas.text2text_features + + else: + raise ValueError + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS[_DATASETNAME] + train = dl_manager.download(urls[0]) + open_test = dl_manager.download(urls[1]) + closed_test = dl_manager.download(urls[2]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": train, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": closed_test, + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": open_test, + "split": "dev", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + columns = ["id", "paraphrase1", "paraphrase2", "is_paraphrase"] + dataset = pd.read_csv(filepath, header=None) + dataset.columns = columns + dataset = dataset.dropna() + + dataset["is_paraphrase"] = dataset["is_paraphrase"].astype(int) + + if self.config.schema in [ + "paraphrase_source", + "non_paraphrase_source", + "all_source", + # "source" + ]: + for i, row in 
dataset.iterrows(): + yield i, {"id": i, "paraphrase1": row["paraphrase1"], "paraphrase2": row["paraphrase2"], "is_paraphrase": row["is_paraphrase"]} + + elif self.config.schema == f"seacrowd_paraphrase_{self.SEACROWD_SCHEMA_NAME}": + for i, row in dataset[dataset["is_paraphrase"] == 1].iterrows(): + yield i, {"id": i, "text_1": row["paraphrase1"], "text_2": row["paraphrase2"], "text_1_name": "anchor_text", "text_2_name": "paraphrased_text"} + + elif self.config.schema == f"seacrowd_non_paraphrase_{self.SEACROWD_SCHEMA_NAME}": + for i, row in dataset[dataset["is_paraphrase"] == 0].iterrows(): + yield i, {"id": i, "text_1": row["paraphrase1"], "text_2": row["paraphrase2"], "text_1_name": "anchor_text", "text_2_name": "non_paraphrased_text"} + + elif self.config.schema == f"seacrowd_all_{self.SEACROWD_SCHEMA_NAME}": + for i, row in dataset.iterrows(): + yield i, {"id": i, "text_1": row["paraphrase1"], "text_2": row["paraphrase2"], "text_1_name": "anchor_text", "text_2_name": "paraphrased_text" if row["is_paraphrase"] else "non_paraphrased_text"} + + else: + raise ValueError \ No newline at end of file diff --git a/seacrowd/sea_datasets/myanmar_rakhine_parallel/__init__.py b/seacrowd/sea_datasets/myanmar_rakhine_parallel/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/myanmar_rakhine_parallel/myanmar_rakhine_parallel.py b/seacrowd/sea_datasets/myanmar_rakhine_parallel/myanmar_rakhine_parallel.py new file mode 100644 index 000000000..3a351d19e --- /dev/null +++ b/seacrowd/sea_datasets/myanmar_rakhine_parallel/myanmar_rakhine_parallel.py @@ -0,0 +1,179 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{myint-oo-etal-2019-neural, + title = "Neural Machine Translation between {M}yanmar ({B}urmese) and {R}akhine ({A}rakanese)", + author = "Myint Oo, Thazin and + Kyaw Thu, Ye and + Mar Soe, Khin", + editor = {Zampieri, Marcos and + Nakov, Preslav and + Malmasi, Shervin and + Ljube{\v{s}}i{\'c}, Nikola and + Tiedemann, J{\"o}rg and + Ali, Ahmed}, + booktitle = "Proceedings of the Sixth Workshop on {NLP} for Similar Languages, Varieties and Dialects", + month = jun, + year = "2019", + address = "Ann Arbor, Michigan", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/W19-1408", + doi = "10.18653/v1/W19-1408", + pages = "80--88", +} +""" + +_DATASETNAME = "myanmar_rakhine_parallel" +_DESCRIPTION = """\ +The data contains 18,373 Myanmar sentences of the ASEAN-MT Parallel Corpus, +which is a parallel corpus in the travel domain. 
It contains six main +categories: people (greeting, introduction, and communication), survival +(transportation, accommodation, and finance), food (food, beverages, and +restaurants), fun (recreation, traveling, shopping, and nightlife), resource +(number, time, and accuracy), special needs (emergency and health). Manual +translation into the Rakhine language was done by native Rakhine students from +two Myanmar universities, and the translated corpus was checked by the editor +of a Rakhine newspaper. Word segmentation for Rakhine was done manually, and +there are exactly 123,018 words in total. +""" + +_HOMEPAGE = "https://github.com/ye-kyaw-thu/myPar/tree/master/my-rk" +_LANGUAGES = ["mya", "rki"] +_LICENSE = Licenses.GPL_3_0.value +_LOCAL = False +_URLS = { + "train_mya": "https://raw.githubusercontent.com/ye-kyaw-thu/myPar/master/my-rk/ver-0.1/train.my", + "dev_mya": "https://raw.githubusercontent.com/ye-kyaw-thu/myPar/master/my-rk/ver-0.1/dev.my", + "test_mya": "https://raw.githubusercontent.com/ye-kyaw-thu/myPar/master/my-rk/ver-0.1/test.my", + "train_rki": "https://raw.githubusercontent.com/ye-kyaw-thu/myPar/master/my-rk/ver-0.1/train.rk", + "dev_rki": "https://raw.githubusercontent.com/ye-kyaw-thu/myPar/master/my-rk/ver-0.1/dev.rk", + "test_rki": "https://raw.githubusercontent.com/ye-kyaw-thu/myPar/master/my-rk/ver-0.1/test.rk", +} +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] + +_SOURCE_VERSION = "0.1.0" +_SEACROWD_VERSION = "1.0.0" + + +class MyanmarRakhineParallel(datasets.GeneratorBasedBuilder): + """Myanmar-Rakhine Parallel dataset from https://github.com/ye-kyaw-thu/myPar/tree/master/my-rk""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "t2t" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source" or self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text2text_features + else: + raise ValueError(f"Invalid config schema: {self.config.schema}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + data_paths = { + "train_mya": Path(dl_manager.download_and_extract(_URLS["train_mya"])), + "dev_mya": Path(dl_manager.download_and_extract(_URLS["dev_mya"])), + "test_mya": Path(dl_manager.download_and_extract(_URLS["test_mya"])), + "train_rki": Path(dl_manager.download_and_extract(_URLS["train_rki"])), + "dev_rki": Path(dl_manager.download_and_extract(_URLS["dev_rki"])), + "test_rki": Path(dl_manager.download_and_extract(_URLS["test_rki"])), + } + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "mya_filepath": data_paths["train_mya"], + "rki_filepath": data_paths["train_rki"], + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + 
"mya_filepath": data_paths["test_mya"], + "rki_filepath": data_paths["test_rki"], + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "mya_filepath": data_paths["dev_mya"], + "rki_filepath": data_paths["dev_rki"], + "split": "dev", + }, + ), + ] + + def _generate_examples(self, mya_filepath: Path, rki_filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + # read mya file + with open(mya_filepath, "r", encoding="utf-8") as mya_file: + mya_data = mya_file.readlines() + mya_data = [s.strip("\n") for s in mya_data] + + # read rki file + with open(rki_filepath, "r", encoding="utf-8") as rki_file: + rki_data = rki_file.readlines() + rki_data = [s.strip("\n") for s in rki_data] + + num_sample = len(mya_data) + + for i in range(num_sample): + if self.config.schema == "source" or self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + example = {"id": str(i), "text_1": mya_data[i], "text_2": rki_data[i], "text_1_name": "mya", "text_2_name": "rki"} + yield i, example diff --git a/seacrowd/sea_datasets/mysentence/__init__.py b/seacrowd/sea_datasets/mysentence/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/mysentence/mysentence.py b/seacrowd/sea_datasets/mysentence/mysentence.py new file mode 100644 index 000000000..d4141a8a1 --- /dev/null +++ b/seacrowd/sea_datasets/mysentence/mysentence.py @@ -0,0 +1,170 @@ +# coding=utf-8 +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{Aung_Kyaw Thu_Hlaing_2023, place={Nonthaburi, Thailand}, title={mySentence: Sentence Segmentation for Myanmar Language +using Neural Machine Translation Approach}, volume={9}, url={https://ph05.tci-thaijo.org/index.php/JIIST/article/view/87}, +number={October}, +abstract="In the informal Myanmar language, for which most NLP applications are used, there is no predefined rule to mark the end of the sentence. +Therefore, in this paper, we contributed the first Myanmar sentence segmentation corpus and systemat +ically experimented with twelve neural sequence +labeling architectures trained and tested on both sentence and sentence+paragraph data. The word LSTM + Softmax achieved the highest accuracy of 99.95{\%} +while trained and tested on sentence-only data and 97.40{\%} while trained and tested on sentence + paragraph data.", +journal={Journal of Intelligent Informatics +and Smart Technology}, author={Aung, Thura and Kyaw Thu , Ye and Hlaing , Zar Zar}, year={2023}, month={Nov.}, pages={e001} }; + +@InProceedings{10.1007/978-3-031-36886-8_24, +author="Thu, Ye Kyaw +and Aung, Thura +and Supnithi, Thepchai", +editor="Nguyen, Ngoc Thanh +and Le-Minh, Hoa +and Huynh, Cong-Phap +and Nguyen, Quang-Vu", +title="Neural Sequence Labeling Based Sentence Segmentation for Myanmar Language", +booktitle="The 12th Conference on Information Technology and Its Applications", +year="2023", +publisher="Springer Nature Switzerland", +address="Cham", +pages="285--296", +abstract="In the informal Myanmar language, for which most NLP applications are used, there is no predefined rule to mark the end of the sentence. 
+Therefore, in this paper, we contributed the first Myanmar sentence segmentation corpus and systemat +ically experimented with twelve neural sequence +labeling architectures trained and tested on both sentence and sentence+paragraph data. The word LSTM + Softmax achieved the highest accuracy of 99.95{\%} +while trained and tested on sentence-only data and 97.40{\%} while trained and tested on sentence + paragraph data.", +isbn="978-3-031-36886-8" +} + +""" + +_DATASETNAME = "mysentence" +_DESCRIPTION = """\ +mySentence is a corpus with a total size of around 55K for Myanmar sentence segmentation. In formal Burmese (Myanmar language), sentences are grammatically structured +and typically end with the "။" pote-ma symbol. However, informal language, more commonly used in daily conversations due to its natural flow, does not always follow predefined +rules for ending sentences, making it challenging for machines to identify sentence boundaries. In this corpus, each token of the sentences and paragraphs is tagged from start to finish. +""" + +_HOMEPAGE = "https://github.com/ye-kyaw-thu/mySentence" +_LANGUAGES = ["mya"] +_LICENSE = Licenses.CC_BY_NC_SA_4_0.value +_LOCAL = False +_URLS = { + "sent": { + "train": "https://raw.githubusercontent.com/ye-kyaw-thu/mySentence/main/ver1.0/data/data-sent/sent_tagged/train.tagged", + "valid": "https://raw.githubusercontent.com/ye-kyaw-thu/mySentence/main/ver1.0/data/data-sent/sent_tagged/valid.tagged", + "test": "https://raw.githubusercontent.com/ye-kyaw-thu/mySentence/main/ver1.0/data/data-sent/sent_tagged/test.tagged", + }, + "sent+para": { + "train": "https://raw.githubusercontent.com/ye-kyaw-thu/mySentence/main/ver1.0/data/data-sent+para/sent+para_tagged/train.tagged", + "valid": "https://raw.githubusercontent.com/ye-kyaw-thu/mySentence/main/ver1.0/data/data-sent+para/sent+para_tagged/valid.tagged", + "test": "https://raw.githubusercontent.com/ye-kyaw-thu/mySentence/main/ver1.0/data/data-sent+para/sent+para_tagged/test.tagged", + }, +} + +_SUPPORTED_TASKS = [Tasks.POS_TAGGING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class MysentenceDataset(datasets.GeneratorBasedBuilder): + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=_DESCRIPTION, + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description="sentences SEACrowd schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_and_paragraphs_source", + version=SOURCE_VERSION, + description="sentences para source schema", + schema="source", + subset_id=f"{_DATASETNAME}_and_paragraphs", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_and_paragraphs_seacrowd_seq_label", + version=SEACROWD_VERSION, + description="sentence para SEACrowd schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}_and_paragraphs", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "labels": datasets.Sequence(datasets.Value("string")), + } + ) + else: + features = schemas.seq_label_features(["B", "O", "N", "E"]) + return datasets.DatasetInfo( + 
description=_DESCRIPTION,
+            features=features,  # B (Begin), O (Other), N (Next), and E (End)
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        if self.config.subset_id == f"{_DATASETNAME}":
+            DATA_URL_ = _URLS["sent"]
+        elif self.config.subset_id == f"{_DATASETNAME}_and_paragraphs":
+            DATA_URL_ = _URLS["sent+para"]
+        else:
+            raise ValueError(f"No related dataset id for {self.config.subset_id}")
+
+        data_dir = dl_manager.download_and_extract(DATA_URL_)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": data_dir["train"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"filepath": data_dir["test"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": data_dir["valid"],
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
+
+        with open(filepath, "r", encoding="utf-8") as filein:
+            examples = [line.strip("\n").split(" ") for line in filein.readlines()]
+        for eid, exam in enumerate(examples):
+            tokens = []
+            pos = []
+            for tok_chunk in exam:
+                tok_ = tok_chunk.split("/")
+                tokens.append(tok_[0])
+                pos.append(tok_[1])
+            yield eid, {"id": str(eid), "tokens": tokens, "labels": pos}
diff --git a/seacrowd/sea_datasets/myxnli/__init__.py b/seacrowd/sea_datasets/myxnli/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/myxnli/myxnli.py b/seacrowd/sea_datasets/myxnli/myxnli.py
new file mode 100644
index 000000000..043d5676e
--- /dev/null
+++ b/seacrowd/sea_datasets/myxnli/myxnli.py
@@ -0,0 +1,143 @@
+from pathlib import Path
+
+import datasets
+import pandas as pd
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+
+_CITATION = """
+@misc{myXNLI2023,
+    title = "myXNLI",
+    author = "akhtet",
+    year = "2023",
+    url = "https://github.com/akhtet/myXNLI",
+}
+"""
+
+_DATASETNAME = "myxnli"
+
+_DESCRIPTION = """
+The myXNLI corpus is a collection of Myanmar language data designed for the Natural Language Inference (NLI) task, which
+originated from the XNLI and MultiNLI English datasets. The 7,500 sentence pairs from the XNLI English development and
+test sets are human-translated into Myanmar. The 392,702 sentence pairs from the MultiNLI English training data are
+machine-translated. It also extends its scope by adding Myanmar translations to the XNLI 15-language
+parallel corpus, to create a 16-language parallel corpus.
+""" + +_HOMEPAGE = "https://github.com/akhtet/myXNLI" + +_LANGUAGES = ["mya"] + +_LICENSE = Licenses.CC_BY_NC_4_0.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: { + "train": "https://huggingface.co/datasets/akhtet/myXNLI/resolve/main/data/train-00000-of-00001-2614419e00195781.parquet", + "dev": "https://huggingface.co/datasets/akhtet/myXNLI/resolve/main/data/validation-00000-of-00001-9c168eb31d1d810b.parquet", + "test": "https://huggingface.co/datasets/akhtet/myXNLI/resolve/main/data/test-00000-of-00001-0fd9f93baf8c9cdb.parquet", + }, +} + +_SUPPORTED_TASKS = [Tasks.TEXTUAL_ENTAILMENT] + +_SOURCE_VERSION = "1.1.0" + +_SEACROWD_VERSION = "1.0.0" + + +class MyXNLIDataset(datasets.GeneratorBasedBuilder): + """The myXNLI corpus is a collection of Myanmar language data designed for the Natural Language Inference task.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_pairs", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_pairs", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "genre": datasets.Value("string"), + "label": datasets.ClassLabel(names=["contradiction", "entailment", "neutral"]), + "sentence1_en": datasets.Value("string"), + "sentence2_en": datasets.Value("string"), + "sentence1_my": datasets.Value("string"), + "sentence2_my": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_pairs": + features = schemas.pairs_features(["contradiction", "entailment", "neutral"]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> list[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": data_dir, "split": "train"}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": data_dir, "split": "test"}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": data_dir, "split": "dev"}, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> tuple[int, dict]: + if self.config.schema == "source": + df = pd.read_parquet(filepath[split]) + for i, row in df.iterrows(): + yield i, { + "genre": row["genre"], + "label": row["label"], + "sentence1_en": row["sentence1_en"], + "sentence2_en": row["sentence2_en"], + "sentence1_my": row["sentence1_my"], + "sentence2_my": row["sentence2_my"], + } + + elif self.config.schema == "seacrowd_pairs": + df = pd.read_parquet(filepath[split]) + for i, row in df.iterrows(): + yield i, { + "id": str(i), + "text_1": row["sentence1_my"], + "text_2": row["sentence2_my"], + "label": row["label"], + } diff --git a/seacrowd/sea_datasets/newsph/__init__.py b/seacrowd/sea_datasets/newsph/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/newsph/newsph.py 
b/seacrowd/sea_datasets/newsph/newsph.py new file mode 100644 index 000000000..bb77990c4 --- /dev/null +++ b/seacrowd/sea_datasets/newsph/newsph.py @@ -0,0 +1,109 @@ +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{cruz2021exploiting, + title={Exploiting news article structure for automatic corpus generation of entailment datasets}, + author={Cruz, Jan Christian Blaise and Resabal, Jose Kristian and Lin, James and Velasco, Dan John and Cheng, Charibeth}, + booktitle={PRICAI 2021: Trends in Artificial Intelligence: 18th Pacific Rim International Conference on Artificial Intelligence, PRICAI 2021, Hanoi, Vietnam, November 8--12, 2021, Proceedings, Part II 18}, + pages={86--99}, + year={2021}, + organization={Springer} +} +""" +_DATASETNAME = "newsph" +_LANGUAGES = ["fil", "tgl"] +_DESCRIPTION = """\ +Raw collection of news articles in Filipino which can be used for language modelling. +""" +_HOMEPAGE = "https://huggingface.co/datasets/newsph" +_LICENSE = Licenses.GPL_3_0.value +_LOCAL = False +_URLS = "https://s3.us-east-2.amazonaws.com/blaisecruz.com/datasets/newsph/newsph.zip" +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class NewsPhDataset(datasets.GeneratorBasedBuilder): + """ + Raw collection of news articles in Filipino which can be used for language modelling. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="newsph_source", + version=SOURCE_VERSION, + description="newsph source schema", + schema="source", + subset_id="newsph", + ), + SEACrowdConfig( + name="newsph_seacrowd_ssp", + version=SEACROWD_VERSION, + description="newsph SEACrowd schema", + schema="seacrowd_ssp", + subset_id="newsph", + ), + ] + + DEFAULT_CONFIG_NAME = "newsph_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_ssp": + features = schemas.self_supervised_pretraining.features + else: + raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + data_dir = dl_manager.download_and_extract(_URLS) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(data_dir, "newsph", "train.txt"), + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + if self.config.schema == "source" or self.config.schema == "seacrowd_ssp": + with open(filepath, encoding="utf-8") as f: + for idx, row in enumerate(f): + if row.strip(): + yield idx, {"id": str(idx), "text": row} + else: + yield idx, {"id": str(idx), "text": ""} + else: + raise NotImplementedError diff --git a/seacrowd/sea_datasets/ntrex_128/__init__.py b/seacrowd/sea_datasets/ntrex_128/__init__.py new file mode 100644 
index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/ntrex_128/ntrex_128.py b/seacrowd/sea_datasets/ntrex_128/ntrex_128.py new file mode 100644 index 000000000..674b492e2 --- /dev/null +++ b/seacrowd/sea_datasets/ntrex_128/ntrex_128.py @@ -0,0 +1,444 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +NTREX-128, a data set for machine translation (MT) evaluation, includes 123 documents \ +(1,997 sentences, 42k words) translated from English into 128 target languages. \ +9 languages are natively spoken in Southeast Asia, i.e., Burmese, Filipino, \ +Hmong, Indonesian, Khmer, Lao, Malay, Thai, and Vietnamese. +""" +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{federmann-etal-2022-ntrex, + title = "{NTREX}-128 {--} News Test References for {MT} Evaluation of 128 Languages", + author = "Federmann, Christian and + Kocmi, Tom and + Xin, Ying", + editor = "Ahuja, Kabir and + Anastasopoulos, Antonios and + Patra, Barun and + Neubig, Graham and + Choudhury, Monojit and + Dandapat, Sandipan and + Sitaram, Sunayana and + Chaudhary, Vishrav", + booktitle = "Proceedings of the First Workshop on Scaling Up Multilingual Evaluation", + month = nov, + year = "2022", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.sumeval-1.4", + pages = "21--24", +} +""" + +_DATASETNAME = "ntrex_128" + +_DESCRIPTION = """\ +NTREX-128, a data set for machine translation (MT) evaluation, includes 123 documents \ +(1,997 sentences, 42k words) translated from English into 128 target languages. \ +9 languages are natively spoken in Southeast Asia, i.e., Burmese, Filipino, \ +Hmong, Indonesian, Khmer, Lao, Malay, Thai, and Vietnamese. 
+""" + +_HOMEPAGE = "https://github.com/MicrosoftTranslator/NTREX" + +_LANGUAGES = ["mya", "fil", "ind", "khm", "lao", "zlm", "tha", "vie", "hmv"] + +_LICENSE = Licenses.CC_BY_SA_4_0.value + +_LOCAL = False + +# _MAPPING = {"mya": "mya", "fil": "fil", "ind": "ind", "khm": "khm", "lao": "lao", "zlm": "msa", "tha": "tha", "vie": "vie", "hmv": "hmn"} +_MAPPING = { + "afr": "afr", + "amh": "amh", + "arb": "arb", + "aze-Latn": "aze-Latn", + "bak": "bak", + "bel": "bel", + "bem": "bem", + "ben": "ben", + "bod": "bod", + "bos": "bos", + "bul": "bul", + "cat": "cat", + "ces": "ces", + "ckb-Arab": "ckb-Arab", + "cym": "cym", + "dan": "dan", + "deu": "deu", + "div": "div", + "dzo": "dzo", + "ell": "ell", + "eng-GB": "eng-GB", + "eng-IN": "eng-IN", + "eng-US": "eng-US", + "est": "est", + "eus": "eus", + "ewe": "ewe", + "fao": "fao", + "fas": "fas", + "fij": "fij", + "fil": "fil", + "fin": "fin", + "fra": "fra", + "fra-CA": "fra-CA", + "fuc": "fuc", + "gle": "gle", + "glg": "glg", + "guj": "guj", + "hau": "hau", + "heb": "heb", + "hin": "hin", + "hmv": "hmn", + "hrv": "hrv", + "hun": "hun", + "hye": "hye", + "ibo": "ibo", + "ind": "ind", + "isl": "isl", + "ita": "ita", + "jpn": "jpn", + "kan": "kan", + "kat": "kat", + "kaz": "kaz", + "khm": "khm", + "kin": "kin", + "kir": "kir", + "kmr": "kmr", + "kor": "kor", + "lao": "lao", + "lav": "lav", + "lit": "lit", + "ltz": "ltz", + "mal": "mal", + "mar": "mar", + "mey": "mey", + "mkd": "mkd", + "mlg": "mlg", + "mlt": "mlt", + "mon": "mon", + "mri": "mri", + "zlm": "msa", + "mya": "mya", + "nde": "nde", + "nep": "nep", + "nld": "nld", + "nno": "nno", + "nob": "nob", + "nso": "nso", + "nya": "nya", + "orm": "orm", + "pan": "pan", + "pol": "pol", + "por": "por", + "por-BR": "por-BR", + "prs": "prs", + "pus": "pus", + "ron": "ron", + "rus": "rus", + "shi": "shi", + "sin": "sin", + "slk": "slk", + "slv": "slv", + "smo": "smo", + "sna-Latn": "sna-Latn", + "snd-Arab": "snd-Arab", + "som": "som", + "spa": "spa", + "spa-MX": "spa-MX", + "sqi": "sqi", + "srp-Cyrl": "srp-Cyrl", + "srp-Latn": "srp-Latn", + "ssw": "ssw", + "swa": "swa", + "swe": "swe", + "tah": "tah", + "tam": "tam", + "tat": "tat", + "tel": "tel", + "tgk-Cyrl": "tgk-Cyrl", + "tha": "tha", + "tir": "tir", + "ton": "ton", + "tsn": "tsn", + "tuk": "tuk", + "tur": "tur", + "uig": "uig", + "ukr": "ukr", + "urd": "urd", + "uzb": "uzb", + "ven": "ven", + "vie": "vie", + "wol": "wol", + "xho": "xho", + "yor": "yor", + "yue": "yue", + "zho-CN": "zho-CN", + "zho-TW": "zho-TW", + "zul": "zul", +} +_URLS = { + _DATASETNAME: "https://raw.githubusercontent.com/MicrosoftTranslator/NTREX/main/NTREX-128/newstest2019-ref.{lang}.txt", +} + +_ALL_LANG = [ + "afr", + "amh", + "arb", + "aze-Latn", + "bak", + "bel", + "bem", + "ben", + "bod", + "bos", + "bul", + "cat", + "ces", + "ckb-Arab", + "cym", + "dan", + "deu", + "div", + "dzo", + "ell", + "eng-GB", + "eng-IN", + "eng-US", + "est", + "eus", + "ewe", + "fao", + "fas", + "fij", + "fil", + "fin", + "fra", + "fra-CA", + "fuc", + "gle", + "glg", + "guj", + "hau", + "heb", + "hin", + "hmv", + "hrv", + "hun", + "hye", + "ibo", + "ind", + "isl", + "ita", + "jpn", + "kan", + "kat", + "kaz", + "khm", + "kin", + "kir", + "kmr", + "kor", + "lao", + "lav", + "lit", + "ltz", + "mal", + "mar", + "mey", + "mkd", + "mlg", + "mlt", + "mon", + "mri", + "zlm", + "mya", + "nde", + "nep", + "nld", + "nno", + "nob", + "nso", + "nya", + "orm", + "pan", + "pol", + "por", + "por-BR", + "prs", + "pus", + "ron", + "rus", + "shi", + "sin", + "slk", + "slv", + "smo", + "sna-Latn", + "snd-Arab", + 
"som", + "spa", + "spa-MX", + "sqi", + "srp-Cyrl", + "srp-Latn", + "ssw", + "swa", + "swe", + "tah", + "tam", + "tat", + "tel", + "tgk-Cyrl", + "tha", + "tir", + "ton", + "tsn", + "tuk", + "tur", + "uig", + "ukr", + "urd", + "uzb", + "ven", + "vie", + "wol", + "xho", + "yor", + "yue", + "zho-CN", + "zho-TW", + "zul", +] + +# aze-Latn: Azerbaijani (Latin) +# ckb-Arab: Central Kurdish (Sorani) +# eng-GB: English (British), eng-IN: English (India), eng-US: English (US) +# fra: French, fra-CA: French (Canada) +# mya: Myanmar +# por: Portuguese, por-BR: Portuguese (Brazil) +# shi: Shilha +# sna-Latn: Shona (Latin) +# snd-Arab: Sindhi (Arabic) +# spa: Spanish, spa-MX: Spanish (Mexico) +# srp-Cyrl: Serbian (Cyrillic), srp-Latn: Serbian (Latin) +# tgk-Cyrl: Tajik (Cyrillic) +# yue: Cantonese +# zho-CN: Chinese (Simplified), zho-TW: Chinese (Traditional) + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] + +_SOURCE_VERSION = "11.24.2022" + +_SEACROWD_VERSION = "1.0.0" + + +class Ntrex128Dataset(datasets.GeneratorBasedBuilder): + """NTREX-128, a data set for machine translation (MT) evaluation, includes 123 documents \ + (1,997 sentences, 42k words) translated from English into 128 target languages. \ + 9 languages are natively spoken in Southeast Asia, i.e., Burmese, Filipino, \ + Hmong, Indonesian, Khmer, Lao, Malay, Thai, and Vietnamese.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset1}_{subset2}_source", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} {subset1}2{subset2} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{subset1}_{subset2}", + ) + for subset2 in _ALL_LANG + for subset1 in _ALL_LANG + if subset1 != subset2 and (subset1 in _LANGUAGES or subset2 in _LANGUAGES) + ] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset1}_{subset2}_seacrowd_t2t", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} {subset1}2{subset2} SEACrowd schema", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}_{subset1}_{subset2}", + ) + for subset2 in _ALL_LANG + for subset1 in _ALL_LANG + if subset1 != subset2 and (subset1 in _LANGUAGES or subset2 in _LANGUAGES) + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_mya_fil_source" + + def _info(self): + # The format of the source is just texts in different .txt files (each file corresponds to one language). + # Decided make source schema the same as the seacrowd_t2t schema. 
+ if self.config.schema == "source" or self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + lang1 = self.config.name.split("_")[2] + lang2 = self.config.name.split("_")[3] + lang1_txt_path = Path(dl_manager.download_and_extract(_URLS[_DATASETNAME].format(lang=_MAPPING[lang1]))) + lang2_txt_path = Path(dl_manager.download_and_extract(_URLS[_DATASETNAME].format(lang=_MAPPING[lang2]))) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": [lang1_txt_path, lang2_txt_path]}, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + lang1 = self.config.name.split("_")[2] + lang2 = self.config.name.split("_")[3] + + texts1 = [] + texts2 = [] + texts1 = open(filepath[0], "r").readlines() + texts2 = open(filepath[1], "r").readlines() + + if self.config.schema == "source" or self.config.schema == "seacrowd_t2t": + idx = 0 + for line1, line2 in zip(texts1, texts2): + ex = { + "id": str(idx), + "text_1": line1, + "text_2": line2, + "text_1_name": lang1, + "text_2_name": lang2, + } + yield idx, ex + idx += 1 + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/nusaparagraph_emot/nusaparagraph_emot.py b/seacrowd/sea_datasets/nusaparagraph_emot/nusaparagraph_emot.py index c1782487f..177f88d95 100644 --- a/seacrowd/sea_datasets/nusaparagraph_emot/nusaparagraph_emot.py +++ b/seacrowd/sea_datasets/nusaparagraph_emot/nusaparagraph_emot.py @@ -167,4 +167,4 @@ def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: df = pd.read_csv(filepath).reset_index() for row in df.itertuples(): ex = {"id": str(row.id), "text": row.text, "label": row.label} - yield row.id, ex \ No newline at end of file + yield row.id, ex diff --git a/seacrowd/sea_datasets/oil/__init__.py b/seacrowd/sea_datasets/oil/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/oil/oil.py b/seacrowd/sea_datasets/oil/oil.py new file mode 100644 index 000000000..9985dae15 --- /dev/null +++ b/seacrowd/sea_datasets/oil/oil.py @@ -0,0 +1,149 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
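+"""
+Dataloader for the Online Indonesian Learning (OIL) YouTube lesson corpus.
+
+Minimal usage sketch (hypothetical; the exact entry point depends on how the script is invoked,
+e.g. via the SEACrowd helpers or `datasets.load_dataset` on a local copy of this file):
+
+    import datasets
+
+    dset = datasets.load_dataset("seacrowd/sea_datasets/oil/oil.py", name="oil_source", split="train")
+    first = dset[0]  # {"audio": ..., "label": ...} as defined in _info() below
+"""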
+ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses + +_CITATION = """\ +@inproceedings{maxwelll-smith-foley-2023-automated, +title = "Automated speech recognition of {I}ndonesian-{E}nglish language lessons on {Y}ou{T}ube using transfer learning", +author = "Maxwell-Smith, Zara and Foley, Ben", +editor = "Serikov, Oleg + and Voloshina, Ekaterina + and Postnikova, Anna + and Klyachko, Elena + and Vylomova, Ekaterina + and Shavrina, Tatiana + and Le Ferrand, Eric + and Malykh, Valentin + and Tyers, Francis + and Arkhangelskiy, Timofey + and Mikhailov, Vladislav", + booktitle = "Proceedings of the Second Workshop on NLP Applications to Field Linguistics", + month = may, + year = "2023", + address = "Dubrovnik, Croatia", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2023.fieldmatters-1.1", + doi = "10.18653/v1/2023.fieldmatters-1.1", + pages = "1--16", + abstract = "Experiments to fine-tune large multilingual models with limited data from a specific domain or setting has potential + to improve automatic speech recognition (ASR) outcomes. This paper reports on the use of the Elpis ASR pipeline to fine-tune two + pre-trained base models, Wav2Vec2-XLSR-53 and Wav2Vec2-Large-XLSR-Indonesian, with various mixes of data from 3 YouTube channels + teaching Indonesian with English as the language of instruction. We discuss our results inferring new lesson audio (22-46% + word error rate) in the context of speeding data collection in diverse and specialised settings. This study is an example of how + ASR can be used to accelerate natural language research, expanding ethically sourced data in low-resource settings.", +} +""" + +_DATASETNAME = "oil" + +_DESCRIPTION = """\ +The Online Indonesian Learning (OIL) dataset or corpus currently contains lessons from three Indonesian teachers who have posted content on YouTube. 
+""" + +_HOMEPAGE = "https://huggingface.co/datasets/ZMaxwell-Smith/OIL" + +_LANGUAGES = ["eng", "ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.CC_BY_NC_ND_4_0.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: {"train": "https://huggingface.co/api/datasets/ZMaxwell-Smith/OIL/parquet/default/train/0.parquet"}, +} + +_SUPPORTED_TASKS = [] +_SUPPORTED_SCHEMA_STRINGS = [f"seacrowd_{str(TASK_TO_SCHEMA[task]).lower()}" for task in _SUPPORTED_TASKS] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class OIL(datasets.GeneratorBasedBuilder): + """The Online Indonesian Learning (OIL) dataset or corpus currently contains lessons from three Indonesian teachers who have posted content on YouTube.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "audio": datasets.Audio(decode=False), + "label": datasets.ClassLabel(num_classes=98), + } + ) + + else: + raise ValueError(f"Invalid config: {self.config.name}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + train_path = dl_manager.download_and_extract(urls["train"]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": train_path, + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + if self.config.schema == "source": + + df = pd.read_parquet(filepath) + + for index, row in df.iterrows(): + yield index, row.to_dict() + + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/openlid/__init__.py b/seacrowd/sea_datasets/openlid/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/openlid/openlid.py b/seacrowd/sea_datasets/openlid/openlid.py new file mode 100644 index 000000000..a5cffe064 --- /dev/null +++ b/seacrowd/sea_datasets/openlid/openlid.py @@ -0,0 +1,140 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{burchell-etal-2023-open, + title = "An Open Dataset and Model for Language Identification", + author = "Burchell, Laurie and + Birch, Alexandra and + Bogoychev, Nikolay and + Heafield, Kenneth", + editor = "Rogers, Anna and + Boyd-Graber, Jordan and + Okazaki, Naoaki", + booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)", + month = jul, + year = "2023", + address = "Toronto, Canada", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2023.acl-short.75", + doi = "10.18653/v1/2023.acl-short.75", + 
pages = "865--879", + abstract = "Language identification (LID) is a fundamental step in many natural language processing pipelines. However, current LID + systems are far from perfect, particularly on lower-resource languages. We present a LID model which achieves a macro-average F1 + score of 0.93 and a false positive rate of 0.033{\%} across 201 languages, outperforming previous work. We achieve this by training + on a curated dataset of monolingual data, which we audit manually to ensure reliability. We make both the model and the dataset + available to the research community. Finally, we carry out detailed analysis into our model{'}s performance, both in comparison to + existing open models and by language class.", +} +""" + +_LOCAL = False +_LANGUAGES = ["ace", "ban", "bjn", "bug", "ceb", "ilo", "ind", "jav", "kac", "khm", "lao", "min", "lus", "mya", "pag", "shn", "sun", "tgl", "tha", "vie", "war", "zsm"] +_DATASETNAME = "openlid" + +_DESCRIPTION = """\ +This is an open dataset for language identification covering 201 languages, which are curated and audited manually to +ensure high confidence in its data and language labels. 22 languages are native to Southeast Asia speakers. +""" + +_HOMEPAGE = "https://github.com/laurieburchell/open-lid-dataset" +_LICENSE = Licenses.GPL_3_0.value +_URLS = { + _DATASETNAME: "https://data.statmt.org/lid/lid201-data.tsv.gz", +} +_SUPPORTED_TASKS = [Tasks.LANGUAGE_IDENTIFICATION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + +# 201 languages. Each element contains a code for the language, and script (e.g. wol_Latn = Wolof in Latin script) +_TAGS = ['kbp_Latn', 'zul_Latn', 'zho_Hans', 'uig_Arab', 'smo_Latn', 'hrv_Latn', 'tgk_Cyrl', 'guj_Gujr', 'azj_Latn', 'mai_Deva', 'bul_Cyrl', 'hne_Deva', 'wol_Latn', 'ind_Latn', 'lit_Latn', 'epo_Latn', 'prs_Arab', 'kmr_Latn', 'fao_Latn', 'swh_Latn', 'slk_Latn', 'srp_Cyrl', 'bod_Tibt', 'eus_Latn', 'tir_Ethi', 'tam_Taml', 'kas_Deva', 'glg_Latn', 'crh_Latn', 'kon_Latn', 'ayr_Latn', 'por_Latn', 'ben_Beng', 'zho_Hant', 'bug_Latn', 'umb_Latn', 'tzm_Tfng', 'kan_Knda', 'tgl_Latn', 'luo_Latn', 'lij_Latn', 'hun_Latn', 'kin_Latn', 'hat_Latn', 'sag_Latn', 'khm_Khmr', 'heb_Hebr', 'hye_Armn', 'fuv_Latn', 'cjk_Latn', 'ckb_Arab', 'srd_Latn', 'cat_Latn', 'dan_Latn', 'lao_Laoo', 'fra_Latn', 'kam_Latn', 'aeb_Arab', 'ydd_Hebr', 'afr_Latn', 'khk_Cyrl', 'lug_Latn', 'lin_Latn', 'nya_Latn', 'tsn_Latn', 'dzo_Tibt', 'min_Latn', 'war_Latn', 'rus_Cyrl', 'nob_Latn', 'tpi_Latn', 'mlt_Latn', 'mni_Beng', 'ilo_Latn', 'amh_Ethi', 'taq_Latn', 'acq_Arab', 'gaz_Latn', 'ltg_Latn', 'kac_Latn', 'ibo_Latn', 'gle_Latn', 'mya_Mymr', 'grn_Latn', 'kik_Latn', 'jav_Latn', 'awa_Deva', 'ars_Arab', 'swe_Latn', 'uzn_Latn', 'mos_Latn', 'lus_Latn', 'mal_Mlym', 'ita_Latn', 'dik_Latn', 'ewe_Latn', 'sat_Olck', 'pan_Guru', 'est_Latn', 'kab_Latn', 'bam_Latn', 'pag_Latn', 'isl_Latn', 'eng_Latn', 'fon_Latn', 'kas_Arab', 'asm_Beng', 'lim_Latn', 'bjn_Arab', 'taq_Tfng', 'deu_Latn', 'pbt_Arab', 'pap_Latn', 'quy_Latn', 'kea_Latn', 'npi_Deva', 'xho_Latn', 'shn_Mymr', 'nso_Latn', 'urd_Arab', 'bos_Latn', 'ron_Latn', 'fur_Latn', 'gla_Latn', 'nus_Latn', 'ltz_Latn', 'arz_Arab', 'bem_Latn', 'fin_Latn', 'kir_Cyrl', 'tha_Thai', 'mag_Deva', 'azb_Arab', 'tel_Telu', 'ell_Grek', 'sot_Latn', 'spa_Latn', 'vie_Latn', 'yor_Latn', 'ceb_Latn', 'vec_Latn', 'sin_Sinh', 'pol_Latn', 'als_Latn', 'lmo_Latn', 'scn_Latn', 'ces_Latn', 'fij_Latn', 'run_Latn', 'som_Latn', 'mkd_Cyrl', 'mar_Deva', 'ast_Latn', 'san_Deva', 'ary_Arab', 'twi_Latn', 'acm_Arab', 'nno_Latn', 'zsm_Latn', 'mri_Latn', 
'kor_Hang', 'sna_Latn', 'pes_Arab', 'ace_Latn', 'bak_Cyrl', 'kat_Geor', 'tur_Latn', 'jpn_Jpan', 'arb_Arab', 'ukr_Cyrl', 'yue_Hant', 'kaz_Cyrl', 'hau_Latn', 'nld_Latn', 'oci_Latn', 'apc_Arab', 'tum_Latn', 'ace_Arab', 'dyu_Latn', 'knc_Latn', 'knc_Arab', 'kmb_Latn', 'bel_Cyrl', 'slv_Latn', 'lvs_Latn', 'bho_Deva', 'tuk_Latn', 'snd_Arab', 'sun_Latn', 'lua_Latn', 'ajp_Arab', 'hin_Deva', 'tso_Latn', 'tat_Cyrl', 'cym_Latn', 'ory_Orya', 'ban_Latn', 'szl_Latn', 'plt_Latn', 'bjn_Latn', 'ssw_Latn'] + + +class OpenLID(datasets.GeneratorBasedBuilder): + """This is an open dataset for language identification covering 201 languages. 22 languages are native to Southeast Asia speakers.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="openlid_source", + version=SOURCE_VERSION, + description="OpenLID source schema", + schema="source", + subset_id="openlid", + ), + SEACrowdConfig( + name="openlid_seacrowd_text", + version=SEACROWD_VERSION, + description="OpenLID Nusantara schema", + schema="seacrowd_text", + subset_id="openlid", + ), + ] + + DEFAULT_CONFIG_NAME = "openlid_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features({"id": datasets.Value("string"), "text": datasets.Value("string"), "label": datasets.Value("string"), "source": datasets.Value("string")}) + elif self.config.schema == "seacrowd_text": + features = schemas.text_features(_TAGS) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # Dataset does not have predetermined split, putting all as TRAIN + urls = _URLS[_DATASETNAME] + filepath = Path(dl_manager.download_and_extract(urls)) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": filepath, + }, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + # Dataset does not have id, using row index as id + with open(filepath) as f: + lines = f.readlines() + + if self.config.schema == "source": + for _id, line in enumerate(lines): + line = line.split("\t") + ex = { + "id": str(_id), + "text": line[0], + "label": line[1], + "source": line[2].strip(), + } + yield _id, ex + + elif self.config.schema == "seacrowd_text": + for _id, line in enumerate(lines): + line = line.split("\t") + ex = { + "id": str(_id), + "text": line[0], + "label": line[1], + } + yield _id, ex + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/openslr/__init__.py b/seacrowd/sea_datasets/openslr/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/openslr/openslr.py b/seacrowd/sea_datasets/openslr/openslr.py new file mode 100644 index 000000000..d2b799cd8 --- /dev/null +++ b/seacrowd/sea_datasets/openslr/openslr.py @@ -0,0 +1,258 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{kjartansson18_sltu, + author={Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha}, + title={{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}}, + year=2018, + booktitle={Proc. 6th Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU 2018)}, + pages={52--55}, + doi={10.21437/SLTU.2018-11} +} +""" + +_DATASETNAME = "openslr" + +_DESCRIPTION = """\ +This data set contains transcribed high-quality audio of Javanese, Sundanese, Burmese, Khmer. This data set\ +come from 3 different projects under OpenSLR initiative +""" + +_HOMEPAGE = "https://www.openslr.org/resources.php" + +_LANGUAGES = ["mya", "jav", "sun", "khm"] + +_LICENSE = Licenses.CC_BY_SA_4_0.value + +_LOCAL = False + +_RESOURCES = { + "SLR35": { + "language": "jav", + "files": [ + "asr_javanese_0.zip", + "asr_javanese_1.zip", + "asr_javanese_2.zip", + "asr_javanese_3.zip", + "asr_javanese_4.zip", + "asr_javanese_5.zip", + "asr_javanese_6.zip", + "asr_javanese_7.zip", + "asr_javanese_8.zip", + "asr_javanese_9.zip", + "asr_javanese_a.zip", + "asr_javanese_b.zip", + "asr_javanese_c.zip", + "asr_javanese_d.zip", + "asr_javanese_e.zip", + "asr_javanese_f.zip", + ], + "index_files": ["asr_javanese/utt_spk_text.tsv"] * 16, + "data_dirs": ["asr_javanese/data"] * 16, + }, + "SLR36": { + "language": "sun", + "files": [ + "asr_sundanese_0.zip", + "asr_sundanese_1.zip", + "asr_sundanese_2.zip", + "asr_sundanese_3.zip", + "asr_sundanese_4.zip", + "asr_sundanese_5.zip", + "asr_sundanese_6.zip", + "asr_sundanese_7.zip", + "asr_sundanese_8.zip", + "asr_sundanese_9.zip", + "asr_sundanese_a.zip", + "asr_sundanese_b.zip", + "asr_sundanese_c.zip", + "asr_sundanese_d.zip", + "asr_sundanese_e.zip", + "asr_sundanese_f.zip", + ], + "index_files": ["asr_sundanese/utt_spk_text.tsv"] * 16, + "data_dirs": ["asr_sundanese/data"] * 16, + }, + "SLR41": { + "language": "jav", + "files": ["jv_id_female.zip", "jv_id_male.zip"], + "index_files": ["jv_id_female/line_index.tsv", "jv_id_male/line_index.tsv"], + "data_dirs": ["jv_id_female/wavs", "jv_id_male/wavs"], + }, + "SLR42": { + "language": "khm", + "files": ["km_kh_male.zip"], + "index_files": ["km_kh_male/line_index.tsv"], + "data_dirs": ["km_kh_male/wavs"], + }, + "SLR44": { + "language": "sun", + "files": ["su_id_female.zip", "su_id_male.zip"], + "index_files": ["su_id_female/line_index.tsv", "su_id_male/line_index.tsv"], + "data_dirs": ["su_id_female/wavs", "su_id_male/wavs"], + }, + "SLR80": { + "language": "mya", + "files": ["my_mm_female.zip"], + "index_files": ["line_index.tsv"], + "data_dirs": [""], + }, +} +_URLS = {_DATASETNAME: "https://openslr.org/resources/{subset}"} + +_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class 
OpenSLRDataset(datasets.GeneratorBasedBuilder): + """This data set contains transcribed high-quality audio of Javanese, Sundanese, Burmese, Khmer. This data set + come from 3 different projects under OpenSLR initiative""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig(name=f"{_DATASETNAME}_{subset}_{_RESOURCES[subset]['language']}_source", version=datasets.Version(_SOURCE_VERSION), description=f"{_DATASETNAME} source schema", schema="source", subset_id=f"{_DATASETNAME}") + for subset in _RESOURCES.keys() + ] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_{_RESOURCES[subset]['language']}_seacrowd_sptext", version=datasets.Version(_SEACROWD_VERSION), description=f"{_DATASETNAME} SEACrowd schema", schema="seacrowd_sptext", subset_id=f"{_DATASETNAME}" + ) + for subset in _RESOURCES.keys() + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_SLR41_jav_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "path": datasets.Value("string"), + "audio": datasets.Audio(sampling_rate=48_000), + "sentence": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_sptext": + features = schemas.speech_text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + subset = self.config.name.split("_")[1] + urls = [f"{_URLS[_DATASETNAME].format(subset=subset[3:])}/{file}" for file in _RESOURCES[subset]["files"]] + data_dir = dl_manager.download_and_extract(urls) + + path_to_indexs = [os.path.join(path, f"{_RESOURCES[subset]['index_files'][i]}") for i, path in enumerate(data_dir)] + path_to_datas = [os.path.join(path, f"{_RESOURCES[subset]['data_dirs'][i]}") for i, path in enumerate(data_dir)] + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": [path_to_indexs, path_to_datas], + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + subset = self.config.name.split("_")[1] + path_to_indexs, path_to_datas = filepath[0], filepath[1] + counter = -1 + if subset in ["SLR35", "SLR36"]: + sentence_index = {} + for i, path_to_index in enumerate(path_to_indexs): + with open(path_to_index, encoding="utf-8") as f: + lines = f.readlines() + for id_, line in enumerate(lines): + field_values = re.split(r"\t\t?", line.strip()) + filename, user_id, sentence = field_values + sentence_index[filename] = sentence + for path_to_data in sorted(Path(path_to_datas[i]).rglob("*.flac")): + filename = path_to_data.stem + if path_to_data.stem not in sentence_index: + continue + path = str(path_to_data.resolve()) + sentence = sentence_index[filename] + counter += 1 + if self.config.schema == "source": + example = {"path": path, "audio": path, "sentence": sentence} + elif self.config.schema == "seacrowd_sptext": + example = { + "id": counter, + "path": path, + "audio": path, + "text": sentence, + "speaker_id": user_id, + "metadata": { + "speaker_age": None, + "speaker_gender": None, + }, + } + yield counter, example + else: + for i, path_to_index in enumerate(path_to_indexs): + geneder = "female" if "female" in path_to_index else "male" + with 
open(path_to_index, encoding="utf-8") as f: + lines = f.readlines() + for id_, line in enumerate(lines): + # Following regexs are needed to normalise the lines, since the datasets + # are not always consistent and have bugs: + line = re.sub(r"\t[^\t]*\t", "\t", line.strip()) + field_values = re.split(r"\t\t?", line) + if len(field_values) != 2: + continue + filename, sentence = field_values + path = os.path.join(path_to_datas[i], f"{filename}.wav") + counter += 1 + if self.config.schema == "source": + example = {"path": path, "audio": path, "sentence": sentence} + elif self.config.schema == "seacrowd_sptext": + example = { + "id": counter, + "path": path, + "audio": path, + "text": sentence, + "speaker_id": None, + "metadata": { + "speaker_age": None, + "speaker_gender": geneder, + }, + } + yield counter, example diff --git a/seacrowd/sea_datasets/openvivqa/__init__.py b/seacrowd/sea_datasets/openvivqa/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/openvivqa/openvivqa.py b/seacrowd/sea_datasets/openvivqa/openvivqa.py new file mode 100644 index 000000000..964bbe83b --- /dev/null +++ b/seacrowd/sea_datasets/openvivqa/openvivqa.py @@ -0,0 +1,162 @@ +# coding=utf-8 +import json +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{tran2021vivqa, + title={ViVQA: Vietnamese visual question answering}, + author={Tran, Khanh Quoc and Nguyen, An Trong and Le, An Tran-Hoai and Van Nguyen, Kiet}, + booktitle={Proceedings of the 35th Pacific Asia Conference on Language, Information and Computation}, + pages={683--691}, + year={2021} +} +""" +_DATASETNAME = "openvivqa" +_DESCRIPTION = """\ +OpenViVQA (Open-domain Vietnamese Visual Question Answering) is a dataset for VQA (Visual Question Answering) with +open-ended answers in Vietnamese. It consisted of 11199 images associated with 37914 question-answer pairs (QAs). +Images in the OpenViVQA dataset are captured in Vietnam and question-answer pairs are created manually by Vietnamese +crowd workers. 
+""" +_HOMEPAGE = "https://huggingface.co/datasets/uitnlp/OpenViVQA-dataset" +_LANGUAGES = ["vie"] +_LICENSE = Licenses.MIT.value +_LOCAL = False +_HF_URL = "https://huggingface.co/datasets/uitnlp/OpenViVQA-dataset" +_URLS = { + "dataset": { + "train": "https://huggingface.co/datasets/uitnlp/OpenViVQA-dataset/raw/main/vlsp2023_train_data.json", + "test": "https://huggingface.co/datasets/uitnlp/OpenViVQA-dataset/raw/main/vlsp2023_test_data.json", + "dev": "https://huggingface.co/datasets/uitnlp/OpenViVQA-dataset/raw/main/vlsp2023_dev_data.json", + }, + "images": { + "train": "https://huggingface.co/datasets/uitnlp/OpenViVQA-dataset/resolve/main/train-images.zip?download=true", + "test": "https://huggingface.co/datasets/uitnlp/OpenViVQA-dataset/resolve/main/test-images.zip?download=true", + "dev": "https://huggingface.co/datasets/uitnlp/OpenViVQA-dataset/resolve/main/dev-images.zip?download=true", + }, +} +_SUPPORTED_TASKS = [Tasks.VISUAL_QUESTION_ANSWERING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class OpenViVQADataset(datasets.GeneratorBasedBuilder): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_imqa", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_imqa", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features({"img_path": datasets.Value("string"), + "question": datasets.Value("string"), + "answer": datasets.Value("string"), + "id": datasets.Value("string")}) + elif self.config.schema == "seacrowd_imqa": + features = schemas.imqa_features + # features["meta"] = {"image_path": datasets.Value("string")} + else: + raise ValueError(f"No schema matched for {self.config.schema}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + data_dir = dl_manager.download_and_extract(_URLS["dataset"]) + image_dir = dl_manager.download_and_extract(_URLS["images"]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir["train"], + "imagepath": os.path.join(image_dir["train"], "training-images"), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_dir["test"], + "imagepath": os.path.join(image_dir["test"], "test-images"), + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_dir["dev"], + "imagepath": os.path.join(image_dir["dev"], "dev-images"), + "split": "validation", + }, + ), + ] + + def _generate_examples(self, filepath: Path, imagepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + raw_examples = json.load(open(filepath, "r")) + images = raw_examples["images"] + data_annotations = raw_examples["annotations"] + for sample_id, q_key in enumerate(list(data_annotations.keys())): + quest_id = q_key + sample = 
data_annotations[q_key] + sample_img_id = sample["image_id"] + sample_img_name = images[str(sample_img_id)] + sample_img_path = os.path.join(imagepath, sample_img_name) + sample_question = sample["question"] + sample_answer = sample["answer"] + if self.config.schema == "source": + example = { + "img_path": sample_img_path, + "question": sample_question, + "answer": sample_answer, + "id": quest_id, + } + elif self.config.schema == "seacrowd_imqa": + example = { + "id": q_key, + "question_id": q_key, + "document_id": q_key, + "questions": [sample_question], + "type": None, + "choices": None, + "context": sample_img_id, + "answer": [sample_answer], + "image_paths": [sample_img_path], + "meta": {}, + } + yield sample_id, example diff --git a/seacrowd/sea_datasets/orchid_pos/__init__.py b/seacrowd/sea_datasets/orchid_pos/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/orchid_pos/orchid_pos.py b/seacrowd/sea_datasets/orchid_pos/orchid_pos.py new file mode 100644 index 000000000..e18f5924e --- /dev/null +++ b/seacrowd/sea_datasets/orchid_pos/orchid_pos.py @@ -0,0 +1,272 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import re +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{sornlertlamvanich1999building, + title={Building a Thai part-of-speech tagged corpus (ORCHID)}, + author={Sornlertlamvanich, Virach and Takahashi, Naoto and Isahara, Hitoshi}, + journal={Journal of the Acoustical Society of Japan (E)}, + volume={20}, + number={3}, + pages={189--198}, + year={1999}, + publisher={Acoustical Society of Japan} +} +""" + +_DATASETNAME = "orchid_pos" + +_DESCRIPTION = """\ +The ORCHID corpus is a Thai part-of-speech (POS) tagged dataset, resulting from a collaboration between\ +Japan's Communications Research Laboratory (CRL) and Thailand's National Electronics and Computer Technology\ +Center (NECTEC). It is structured at three levels: paragraph, sentence, and word. The dataset incorporates a\ +unique tagset designed for use in multi-lingual machine translation projects, and is tailored to address the\ +challenges of Thai text, which lacks explicit word and sentence boundaries, punctuation, and inflection.\ +This dataset includes text information along with numbering for retrieval, and employs a probabilistic trigram\ +model for word segmentation and POS tagging. The ORCHID corpus is specifically structured to reduce ambiguity in\ +POS assignments, making it a valuable resource for Thai language processing and computational linguistics research. 
+""" + +_HOMEPAGE = "https://github.com/wannaphong/corpus_mirror/releases/tag/orchid-v1.0" + +_LANGUAGES = ["tha"] + +_LICENSE = Licenses.CC_BY_NC_SA_3_0.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://github.com/wannaphong/corpus_mirror/releases/download/orchid-v1.0/orchid97.crp.utf", +} + +_SUPPORTED_TASKS = [Tasks.POS_TAGGING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class OrchidPOSDataset(datasets.GeneratorBasedBuilder): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + label_names = [ + "NPRP", + "NCNM", + "NONM", + "NLBL", + "NCMN", + "NTTL", + "PPRS", + "PDMN", + "PNTR", + "PREL", + "VACT", + "VSTA", + "VATT", + "XVBM", + "XVAM", + "XVMM", + "XVBB", + "XVAE", + "DDAN", + "DDAC", + "DDBQ", + "DDAQ", + "DIAC", + "DIBQ", + "DIAQ", + "DCNM", + "DONM", + "ADVN", + "ADVI", + "ADVP", + "ADVS", + "CNIT", + "CLTV", + "CMTR", + "CFQC", + "CVBL", + "JCRG", + "JCMP", + "JSBR", + "RPRE", + "INT", + "FIXN", + "FIXV", + "EAFF", + "EITT", + "NEG", + "PUNC", + "CMTR@PUNC", + ] + if self.config.schema == "source": + features = datasets.Features( + { + "ttitle": datasets.Value("string"), + "etitle": datasets.Value("string"), + "tauthor": datasets.Value("string"), + "eauthor": datasets.Value("string"), + "tinbook": datasets.Value("string"), + "einbook": datasets.Value("string"), + "tpublisher": datasets.Value("string"), + "epublisher": datasets.Value("string"), + "year": datasets.Value("string"), + "file": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "labels": datasets.Sequence(datasets.ClassLabel(names=label_names)), + } + ) + + elif self.config.schema == "seacrowd_seq_label": + features = schemas.seq_label_features(label_names) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(data_dir, ""), + "split": "train", + }, + ) + ] + + def _get_tokens_labels(self, paragraphs): + tokens = [] + labels = [] + token_mapping = { + "": " ", + "": "!", + "": '"', + "": "#", + "": "$", + "": "%", + "": "&", + "": "'", + "": "/", + "": ":", + "": ";", + "": "<", + "": "=", + "": ">", + "": "?", + "": "@", + "": "(", + "": "[", + "": ")", + "": "]", + "": "*", + "": "^", + "": "+", + "": "_", + "": ",", + "left_curly_bracket": "{", + "": "-", + "": "}", + "": ".", + "": "~", + } + for paragraph in paragraphs: + sentences = re.split(r"#\d+\n", paragraph) + for sentence in sentences[1:]: + token_pos_pairs = sentence.split("//")[1] + for token_pos_pair in token_pos_pairs.split("\n")[1:-1]: + if "/" in token_pos_pair: + token = token_pos_pair.split("/")[0] + 
tokens.append(token_mapping[token] if token in token_mapping.keys() else token) + labels.append(token_pos_pair.split("/")[1]) + else: + token = token_pos_pair.split("@")[0] + tokens.append(token_mapping[token] if token in token_mapping.keys() else token) + labels.append(token_pos_pair.split("@")[1]) + return tokens, labels + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + file_content = open(filepath, "r").read() + texts = file_content.split("%TTitle:") + + idx = 0 + for text in texts[1:]: + file_part = text.split("%File")[-1] + tokens, labels = self._get_tokens_labels(re.split(r"#P\d+\n", file_part)[1:]) + if self.config.schema == "source": + parts = text.split("%") + example = { + "ttitle": parts[0], + "etitle": ":".join(parts[1].split(":")[1:]).strip(), + "tauthor": ":".join(parts[2].split(":")[1:]).strip(), + "eauthor": ":".join(parts[3].split(":")[1:]).strip(), + "tinbook": ":".join(parts[4].split(":")[1:]).strip(), + "einbook": ":".join(parts[5].split(":")[1:]).strip(), + "tpublisher": ":".join(parts[6].split(":")[1:]).strip(), + "epublisher": ":".join(parts[7].split(":")[1:]).strip(), + "year": ":".join(parts[9].split(":")[1:]).strip(), + "file": file_part.strip(), + "tokens": tokens, + "labels": labels, + } + elif self.config.schema == "seacrowd_seq_label": + example = { + "id": idx, + "tokens": tokens, + "labels": labels, + } + yield idx, example + idx += 1 diff --git a/seacrowd/sea_datasets/oscar_2201/oscar_2201.py b/seacrowd/sea_datasets/oscar_2201/oscar_2201.py index 0b78a445f..89fd66348 100644 --- a/seacrowd/sea_datasets/oscar_2201/oscar_2201.py +++ b/seacrowd/sea_datasets/oscar_2201/oscar_2201.py @@ -216,6 +216,9 @@ _LICENSE = Licenses.CC0_1_0.value _BASE_URL = "https://huggingface.co/datasets/oscar-corpus/OSCAR-2201/resolve/main/compressed/{lang}_meta/" +_LOCAL = False +_LANGUAGES = ["war", "ceb", "min", "vie", "ilo", "tgl", "lao", "khm", "mya", "jav", "ind", "tha", "sun", "zlm"] + _SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] _SOURCE_VERSION = "2022.1.0" _SEACROWD_VERSION = "1.0.0" diff --git a/seacrowd/sea_datasets/palito/__init__.py b/seacrowd/sea_datasets/palito/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/palito/palito.py b/seacrowd/sea_datasets/palito/palito.py new file mode 100644 index 000000000..20fc0409a --- /dev/null +++ b/seacrowd/sea_datasets/palito/palito.py @@ -0,0 +1,160 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import (DEFAULT_SEACROWD_VIEW_NAME, + DEFAULT_SOURCE_VIEW_NAME, Licenses, + Tasks) + +_DATASETNAME = "palito" +_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME +_UNIFIED_VIEW_NAME = DEFAULT_SEACROWD_VIEW_NAME + +_CITATION = """ +@inproceedings{dita-etal-2009-building, + title = "Building Online Corpora of {P}hilippine Languages", + author = "Dita, Shirley N. and + Roxas, Rachel Edita O. 
and + Inventado, Paul", + editor = "Kwong, Olivia", + booktitle = "Proceedings of the 23rd Pacific Asia Conference on Language, Information and Computation, Volume 2", + month = dec, + year = "2009", + address = "Hong Kong", + publisher = "City University of Hong Kong", + url = "https://aclanthology.org/Y09-2024", + pages = "646--653", +} +""" + +# We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) +_LANGUAGES = ["bik", "ceb", "hil", "ilo", "tgl", "pam", "pag", "war"] +_LANG_CONFIG = { + "bik": "Bikol", + "ceb": "Cebuano", + "hil": "Hiligaynon", + "ilo": "Ilocano", + "tgl": "Tagalog", + "pam": "Kapampangan", + "pag": "Pangasinense", + "war": "Waray", +} + +_LOCAL = False + +_DESCRIPTION = """\ +This paper aims at describing the building of the online corpora on Philippine +languages as part of the online repository system called Palito. There are five components +of the corpora: the top four major Philippine languages which are Tagalog, Cebuano, +Ilocano and Hiligaynon and the Filipino Sign Language (FSL). The four languages are +composed of 250,000-word written texts each, whereas the FSL is composed of seven +thousand signs in video format. Categories of the written texts include creative writing (such +as novels and stories) and religious texts (such as the Bible). Automated tools are provided +for language analysis such as word count, collocates, and others. This is part of a bigger +corpora building project for Philippine languages that would consider text, speech and +video forms, and the corresponding development of automated tools for language analysis +of these various forms. +""" + +_HOMEPAGE = "https://github.com/imperialite/Philippine-Languages-Online-Corpora/tree/master/PALITO%20Corpus" + +_LICENSE = Licenses.LGPL.value + +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + +_URLS = { + "literary": "https://raw.githubusercontent.com/imperialite/Philippine-Languages-Online-Corpora/master/PALITO%20Corpus/Data/{lang}_Literary_Text.txt", + "religious": "https://raw.githubusercontent.com/imperialite/Philippine-Languages-Online-Corpora/master/PALITO%20Corpus/Data/{lang}_Religious_Text.txt", +} + + +class PalitoDataset(datasets.GeneratorBasedBuilder): + """Palito corpus""" + + subsets = [f"{_DATASETNAME}_{lang}" for lang in _LANGUAGES] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="{sub}_source".format(sub=subset), + version=datasets.Version(_SOURCE_VERSION), + description="Palito {sub} source schema".format(sub=subset), + schema="source", + subset_id="{sub}".format(sub=subset), + ) + for subset in subsets + ] + [ + SEACrowdConfig( + name="{sub}_seacrowd_ssp".format(sub=subset), + version=datasets.Version(_SEACROWD_VERSION), + description="Palito {sub} SEACrowd schema".format(sub=subset), + schema="seacrowd_ssp", + subset_id="{sub}".format(sub=subset), + ) + for subset in subsets + ] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_ssp": + features = schemas.self_supervised_pretraining.features + else: + raise ValueError(f"Invalid config schema: {self.config.schema}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> 
List[datasets.SplitGenerator]: + lang = self.config.name.split("_")[1] + filepaths = [Path(dl_manager.download(_URLS["literary"].format(lang=_LANG_CONFIG[lang]))), Path(dl_manager.download(_URLS["religious"].format(lang=_LANG_CONFIG[lang])))] + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepaths": filepaths}, + ), + ] + + def _generate_examples(self, filepaths: list[Path]) -> Tuple[int, Dict]: + counter = 0 + for path in filepaths: + with open(path, encoding="utf-8") as f: + for line in f.readlines(): + if line.strip() == "": + continue + + if self.config.schema == "source": + yield ( + counter, + { + "id": str(counter), + "text": line.strip(), + }, + ) + elif self.config.schema == "seacrowd_ssp": + yield ( + counter, + { + "id": str(counter), + "text": line.strip(), + }, + ) + + counter += 1 diff --git a/seacrowd/sea_datasets/ph_fake_news_corpus/__init__.py b/seacrowd/sea_datasets/ph_fake_news_corpus/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/ph_fake_news_corpus/ph_fake_news_corpus.py b/seacrowd/sea_datasets/ph_fake_news_corpus/ph_fake_news_corpus.py new file mode 100644 index 000000000..a11a420c3 --- /dev/null +++ b/seacrowd/sea_datasets/ph_fake_news_corpus/ph_fake_news_corpus.py @@ -0,0 +1,115 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@inproceedings{hernandez-devaraj-2019-phfakenews, + author = {Fernandez, Aaron Carl T. and Devaraj, Madhavi}, + title = {Computing the Linguistic-Based Cues of Fake News in the Philippines Towards its Detection}, + booktitle = {Proceedings of the 9th International Conference on Web Intelligence, Mining and Semantics}, + publisher = {Association for Computing Machinery}, + year = {2019}, + url = {https://dl.acm.org/doi/abs/10.1145/3326467.3326490}, + doi = {10.1145/3326467.3326490}, + pages = {1-9}, +} +""" + +_LOCAL = False +_LANGUAGES = ["eng"] +_DATASETNAME = "ph_fake_news_corpus" +_DESCRIPTION = """ +The Philippine Fake News Corpus consists of news headlines and content from various "credible" and "non-credible" +national news outlets. "Credible" sources were national broadsheets available in the National Library of the +Philippines, while "non-credible" sources were sources included in lists of websites with fake or unverified content +provided by government and private institutions. 
+""" + +_HOMEPAGE = "https://github.com/aaroncarlfernandez/Philippine-Fake-News-Corpus" +_LICENSE = Licenses.UNKNOWN.value +_URL = "https://github.com/aaroncarlfernandez/Philippine-Fake-News-Corpus/raw/master/Philippine%20Fake%20News%20Corpus.zip/" + +_SUPPORTED_TASKS = [Tasks.FACT_CHECKING] +_SOURCE_VERSION = "1.0.0" + + +class PhilippineFakeNewsDataset(datasets.GeneratorBasedBuilder): + """ + Dataset of English news articles from the Philippines manually annotated as "credible" or + "non-credible" based on source. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + features = datasets.Features( + { + "Headline": datasets.Value("string"), + "Content": datasets.Value("string"), + "Authors": datasets.Value("string"), + "Date": datasets.Value("string"), + "URL": datasets.Value("string"), + "Brand": datasets.Value("string"), + "Label": datasets.Value("string"), + } + ) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + data_dir = dl_manager.download_and_extract(_URL) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(data_dir, "Philippine Fake News Corpus.csv"), + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + df = pd.read_csv(filepath, index_col=None, header="infer", encoding="utf-8") + for index, example in df.iterrows(): + yield index, example.to_dict() \ No newline at end of file diff --git a/seacrowd/sea_datasets/pho_ner_covid/__init__.py b/seacrowd/sea_datasets/pho_ner_covid/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/pho_ner_covid/pho_ner_covid.py b/seacrowd/sea_datasets/pho_ner_covid/pho_ner_covid.py new file mode 100644 index 000000000..7978401b4 --- /dev/null +++ b/seacrowd/sea_datasets/pho_ner_covid/pho_ner_covid.py @@ -0,0 +1,203 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +_CITATION = """\ +@inproceedings{PhoNER_COVID19, +title = {{COVID-19 Named Entity Recognition for Vietnamese}}, +author = {Thinh Hung Truong and Mai Hoang Dao and Dat Quoc Nguyen}, +booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, +year = {2021} +} +""" + +_DATASETNAME = "pho_ner_covid" + +_DESCRIPTION = """\ +A named entity recognition dataset for Vietnamese with 10 newly-defined entity types in the context of the COVID-19 pandemic. +Data is extracted from news articles and manually annotated. In total, there are 34 984 entities over 10 027 sentences. +""" + +_HOMEPAGE = "https://github.com/VinAIResearch/PhoNER_COVID19/tree/main" + +_LANGUAGES = ["vie"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: { + "word_level": { + "dev": "https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/word/dev_word.json", + "train": "https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/word/train_word.json", + "test": "https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/word/test_word.json", + }, + "syllable_level": { + "dev": "https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/syllable/dev_syllable.json", + "train": "https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/syllable/train_syllable.json", + "test": "https://raw.githubusercontent.com/VinAIResearch/PhoNER_COVID19/main/data/syllable/test_syllable.json", + }, + } +} + +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] +_SUPPORTED_SCHEMA_STRINGS = [f"seacrowd_{str(TASK_TO_SCHEMA[task]).lower()}" for task in _SUPPORTED_TASKS] + +_SUPPORTED_SCHEMA_STRING_MAP: Dict[Tasks, str] = {} + +for task, schema_string in zip(_SUPPORTED_TASKS, _SUPPORTED_SCHEMA_STRINGS): + _SUPPORTED_SCHEMA_STRING_MAP[task] = schema_string + +_SUBSETS = ["word_level", "syllable_level"] +_SPLITS = ["train", "dev", "test"] +_TAGS = [ + "O", + "B-ORGANIZATION", + "I-ORGANIZATION", + "B-SYMPTOM_AND_DISEASE", + "I-SYMPTOM_AND_DISEASE", + "B-LOCATION", + "B-DATE", + "B-PATIENT_ID", + "B-AGE", + "B-NAME", + "I-DATE", + "B-JOB", + "I-LOCATION", + "B-TRANSPORTATION", + "B-GENDER", + "I-TRANSPORTATION", + "I-JOB", + "I-NAME", + "I-AGE", + "I-PATIENT_ID", + "I-GENDER", +] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class PhoNerCovidDataset(datasets.GeneratorBasedBuilder): + """A named entity recognition dataset for Vietnamese with 10 newly-defined entity types in the context of the COVID-19 pandemic.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [] + + for subset_id in _SUBSETS: + BUILDER_CONFIGS.append( + SEACrowdConfig( + name=f"{subset_id}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=subset_id, + ) + ) + + seacrowd_schema_config: list[SEACrowdConfig] = [] + + for seacrowd_schema in _SUPPORTED_SCHEMA_STRINGS: + + seacrowd_schema_config.append( + SEACrowdConfig( + name=f"{subset_id}_{seacrowd_schema}", + version=SEACROWD_VERSION, + 
description=f"{_DATASETNAME} {seacrowd_schema} schema", + schema=f"{seacrowd_schema}", + subset_id=subset_id, + ) + ) + + BUILDER_CONFIGS.extend(seacrowd_schema_config) + + DEFAULT_CONFIG_NAME = f"{_SUBSETS[0]}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "words": datasets.Sequence(datasets.Value("string")), + "tags": datasets.Sequence(datasets.ClassLabel(names=_TAGS)), + } + ) + + elif self.config.schema == _SUPPORTED_SCHEMA_STRING_MAP[Tasks.NAMED_ENTITY_RECOGNITION]: + features = schemas.seq_label_features(label_names=_TAGS) + + else: + raise ValueError(f"Invalid config: {self.config.name}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + split_generators = [] + + for split in _SPLITS: + path = dl_manager.download_and_extract(_URLS[_DATASETNAME][self.config.subset_id][split]) + + split_generators.append( + datasets.SplitGenerator( + name=split, + gen_kwargs={ + "path": path, + }, + ) + ) + + return split_generators + + def _generate_examples(self, path: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + idx = 0 + df = pd.read_json(path, lines=True) + + if self.config.schema == "source": + for _, row in df.iterrows(): + yield idx, row.to_dict() + idx += 1 + + elif self.config.schema == _SUPPORTED_SCHEMA_STRING_MAP[Tasks.NAMED_ENTITY_RECOGNITION]: + df["id"] = df.index + df = df.rename(columns={"words": "tokens", "tags": "labels"}) + + for _, row in df.iterrows(): + yield idx, row.to_dict() + idx += 1 + + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/phoatis/intent_label.txt b/seacrowd/sea_datasets/phoatis/intent_label.txt new file mode 100644 index 000000000..bc6f8524f --- /dev/null +++ b/seacrowd/sea_datasets/phoatis/intent_label.txt @@ -0,0 +1,29 @@ +UNK +abbreviation +aircraft +aircraft#flight +aircraft#flight#flight_no +airfare +airfare#flight +airfare#flight_time +airline +airline#flight +airline#flight_no +airport +capacity +city +city#flight_time +distance +flight +flight#flight_no +flight#flight_time +flight_no +flight_no#flight_time +flight_time +ground_fare +ground_fare#ground_service +ground_service +meal +quantity +restriction +day_name diff --git a/seacrowd/sea_datasets/phoatis/phoatis.py b/seacrowd/sea_datasets/phoatis/phoatis.py new file mode 100644 index 000000000..6524711b8 --- /dev/null +++ b/seacrowd/sea_datasets/phoatis/phoatis.py @@ -0,0 +1,239 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses + +_CITATION = """\ +@article{dao2021intent, + title={Intent Detection and Slot Filling for Vietnamese}, + author={Mai Hoang Dao and Thinh Hung Truong and Dat Quoc Nguyen}, + year={2021}, + eprint={2104.02021}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" + +_DATASETNAME = "phoatis" + +_DESCRIPTION = """\ +This is first public intent detection and slot filling dataset for Vietnamese. The data contains 5871 English utterances from ATIS that are manually translated by professional translators into Vietnamese. 
+""" + +_HOMEPAGE = "https://github.com/VinAIResearch/JointIDSF/" + +_LICENSE = Licenses.UNKNOWN.value + +_URLS = { + _DATASETNAME: { + "syllable": { + "syllable_train": [ + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/train/seq.in", + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/train/seq.out", + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/train/label", + ], + "syllable_dev": [ + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/dev/seq.in", + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/dev/seq.out", + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/dev/label", + ], + "syllable_test": [ + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/test/seq.in", + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/test/seq.out", + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/test/label", + ], + }, + "word": { + "word_train": [ + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/train/seq.in", + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/train/seq.out", + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/train/label", + ], + "word_dev": [ + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/dev/seq.in", + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/dev/seq.out", + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/dev/label", + ], + "word_test": [ + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/test/seq.in", + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/test/seq.out", + "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/test/label", + ], + }, + } +} + +_LOCAL = False +_LANGUAGES = ["vie"] + +_SUPPORTED_TASKS = [Tasks.INTENT_CLASSIFICATION, Tasks.SLOT_FILLING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +def config_constructor_intent_cls(schema: str, version: str, phoatis_subset: str = "syllable") -> SEACrowdConfig: + assert phoatis_subset == "syllable" or phoatis_subset == "word" + + return SEACrowdConfig( + name="phoatis_intent_cls_{phoatis_subset}_{schema}".format(phoatis_subset=phoatis_subset.lower(), schema=schema), + version=version, + description="PhoATIS Intent Classification: {subset} {schema} schema".format(subset=phoatis_subset, schema=schema), + schema=schema, + subset_id=phoatis_subset, + ) + + +def config_constructor_slot_filling(schema: str, version: str, phoatis_subset: str = "syllable") -> SEACrowdConfig: + assert phoatis_subset == "syllable" or phoatis_subset == "word" + + return SEACrowdConfig( + name="phoatis_slot_filling_{phoatis_subset}_{schema}".format(phoatis_subset=phoatis_subset.lower(), schema=schema), + version=version, + description="PhoATIS Slot Filling: {subset} {schema} schema".format(subset=phoatis_subset, schema=schema), + schema=schema, + subset_id=phoatis_subset, + ) + + +class PhoATIS(datasets.GeneratorBasedBuilder): + """This is first public intent detection and slot filling dataset for Vietnamese. 
The data contains 5871 English utterances from ATIS that are manually translated by professional translators into Vietnamese.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + # BUILDER_CONFIGS = [config_constructor_intent_cls("source", _SOURCE_VERSION, subset) for subset in ["syllable", "word"]] + BUILDER_CONFIGS = [] + BUILDER_CONFIGS.extend([config_constructor_intent_cls("seacrowd_text", _SEACROWD_VERSION, subset) for subset in ["syllable", "word"]]) + # BUILDER_CONFIGS.extend([config_constructor_slot_filling("source", _SOURCE_VERSION, subset) for subset in ["syllable", "word"]]) + BUILDER_CONFIGS.extend([config_constructor_slot_filling("seacrowd_seq_label", _SEACROWD_VERSION, subset) for subset in ["syllable", "word"]]) + + BUILDER_CONFIGS.extend( + [ # Default config + SEACrowdConfig( + name="phoatis_source", + version=SOURCE_VERSION, + description="PhoATIS source schema (Syllable version)", + schema="source", + subset_id="syllable", + ), + SEACrowdConfig( + name="phoatis_intent_cls_seacrowd_text", + version=SEACROWD_VERSION, + description="PhoATIS Intent Classification SEACrowd schema (Syllable version)", + schema="seacrowd_text", + subset_id="syllable", + ), + SEACrowdConfig( + name="phoatis_slot_filling_seacrowd_seq_label", + version=SEACROWD_VERSION, + description="PhoATIS Slot Filling SEACrowd schema (Syllable version)", + schema="seacrowd_seq_label", + subset_id="syllable", + ), + ] + ) + + DEFAULT_CONFIG_NAME = "phoatis_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + "intent_label": datasets.Value("string"), + "slot_label": datasets.Sequence(datasets.Value("string")), + } + ) + + elif self.config.schema == "seacrowd_text": + with open("./seacrowd/sea_datasets/phoatis/intent_label.txt", "r+", encoding="utf8") as fw: + intent_label = fw.read() + intent_label = intent_label.split("\n") + features = schemas.text_features(intent_label) + + elif self.config.schema == "seacrowd_seq_label": + with open("./seacrowd/sea_datasets/phoatis/slot_label.txt", "r+", encoding="utf8") as fw: + slot_label = fw.read() + slot_label = slot_label.split("\n") + features = schemas.seq_label_features(slot_label) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + schema = self.config.subset_id + urls = _URLS[_DATASETNAME][schema] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir[f"{schema}_train"], + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_dir[f"{schema}_test"], + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_dir[f"{schema}_dev"], + "split": "dev", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + with open(filepath[0], "r+", encoding="utf8") as fw: + data_input = fw.read() + data_input = data_input.split("\n") + with open(filepath[1], "r+", encoding="utf8") as fw: + data_slot = fw.read() + data_slot = data_slot.split("\n") + with open(filepath[2], "r+", encoding="utf8") as fw: + 
data_intent = fw.read() + data_intent = data_intent.split("\n") + + if self.config.schema == "source": + for idx, text in enumerate(data_input): + example = {} + example["id"] = str(idx) + example["text"] = text + example["intent_label"] = data_intent[idx] + example["slot_label"] = data_slot[idx].split() + yield example["id"], example + + elif self.config.schema == "seacrowd_text": + for idx, text in enumerate(data_input): + example = {} + example["id"] = str(idx) + example["text"] = text + example["label"] = data_intent[idx] + yield example["id"], example + + elif self.config.schema == "seacrowd_seq_label": + for idx, text in enumerate(data_input): + example = {} + example["id"] = str(idx) + example["tokens"] = text.split() + example["labels"] = data_slot[idx].split() + yield example["id"], example diff --git a/seacrowd/sea_datasets/phoatis/slot_label.txt b/seacrowd/sea_datasets/phoatis/slot_label.txt new file mode 100644 index 000000000..06411d630 --- /dev/null +++ b/seacrowd/sea_datasets/phoatis/slot_label.txt @@ -0,0 +1,150 @@ +PAD +UNK +O +B-aircraft_code +I-aircraft_code +B-airline_code +B-airline_name +I-airline_name +B-airport_code +B-airport_name +I-airport_name +B-arrive_date.date_relative +I-arrive_date.date_relative +B-arrive_date.day_name +I-arrive_date.day_name +B-arrive_date.day_number +I-arrive_date.day_number +B-arrive_date.month_name +I-arrive_date.month_name +B-arrive_date.today_relative +B-arrive_time.end_time +I-arrive_time.end_time +B-arrive_time.period_mod +I-arrive_time.period_mod +B-arrive_time.period_of_day +I-arrive_time.period_of_day +B-arrive_time.start_time +I-arrive_time.start_time +B-arrive_time.time +I-arrive_time.time +B-arrive_time.time_relative +I-arrive_time.time_relative +B-city_name +I-city_name +B-class_type +I-class_type +B-connect +I-connect +B-cost_relative +I-cost_relative +B-day_name +I-day_name +B-day_number +I-day_number +B-days_code +B-depart_date.date_relative +I-depart_date.date_relative +B-depart_date.day_name +I-depart_date.day_name +B-depart_date.day_number +I-depart_date.day_number +B-depart_date.month_name +I-depart_date.month_name +B-depart_date.today_relative +I-depart_date.today_relative +B-depart_date.year +I-depart_date.year +B-depart_time.end_time +I-depart_time.end_time +B-depart_time.period_mod +I-depart_time.period_mod +B-depart_time.period_of_day +I-depart_time.period_of_day +B-depart_time.start_time +I-depart_time.start_time +B-depart_time.time +I-depart_time.time +B-depart_time.time_relative +I-depart_time.time_relative +B-economy +I-economy +B-fare_amount +I-fare_amount +B-fare_basis_code +B-flight_days +I-flight_days +B-flight_mod +I-flight_mod +B-flight_number +I-flight_number +B-flight_stop +I-flight_stop +B-flight_time +I-flight_time +B-fromloc.airport_code +B-fromloc.airport_name +I-fromloc.airport_name +B-fromloc.city_name +I-fromloc.city_name +B-fromloc.state_code +B-fromloc.state_name +I-fromloc.state_name +B-meal +I-meal +B-meal_code +I-meal_code +B-meal_description +I-meal_description +B-mod +I-mod +B-month_name +B-or +B-period_of_day +I-period_of_day +B-restriction_code +I-restriction_code +B-return_date.date_relative +I-return_date.date_relative +B-return_date.day_name +I-return_date.day_name +B-return_date.day_number +I-return_date.day_number +B-return_date.month_name +I-return_date.month_name +B-return_date.today_relative +I-return_date.today_relative +B-return_time.period_mod +B-return_time.period_of_day +I-return_time.period_of_day +B-round_trip +I-round_trip +B-state_code +B-state_name +I-state_name 
+B-stoploc.airport_name +B-stoploc.city_name +I-stoploc.city_name +B-stoploc.state_code +B-time +I-time +B-time_relative +B-today_relative +I-today_relative +B-toloc.airport_code +B-toloc.airport_name +I-toloc.airport_name +B-toloc.city_name +I-toloc.city_name +B-toloc.country_name +I-toloc.country_name +B-toloc.state_code +B-toloc.state_name +I-toloc.state_name +B-transport_type +I-transport_type +B-compartment +I-compartment +B-stoploc.airport_code +B-booking_class +I-booking_class diff --git a/seacrowd/sea_datasets/phomt/__init__.py b/seacrowd/sea_datasets/phomt/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/phomt/phomt.py b/seacrowd/sea_datasets/phomt/phomt.py new file mode 100644 index 000000000..9a63725bb --- /dev/null +++ b/seacrowd/sea_datasets/phomt/phomt.py @@ -0,0 +1,139 @@ +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{PhoMT, +title = {{PhoMT: A High-Quality and Large-Scale Benchmark Dataset for Vietnamese-English Machine Translation}}, +author = {Long Doan and Linh The Nguyen and Nguyen Luong Tran and Thai Hoang and Dat Quoc Nguyen}, +booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, +year = {2021}, +pages = {4495--4503} +} +""" + +_DATASETNAME = "phomt" + + +_DESCRIPTION = """\ +PhoMT is a high-quality and large-scale Vietnamese-English parallel dataset of 3.02M sentence pairs, which is 2.9M +pairs larger than the benchmark Vietnamese-English machine translation corpus IWSLT15. This is the first large-scale +Vietnamese-English machine translation study. +""" + +_LANGUAGES = ["vie", "eng"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) +_LOCAL = True + +_HOMEPAGE = "https://github.com/VinAIResearch/PhoMT" + +_LICENSE = Licenses.MIT.value + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + +MAP_LANG = {"eng": "en", "vie": "vi"} + + +def seacrowd_config_constructor(src_lang, tgt_lang, schema, version): + if src_lang == "" or tgt_lang == "": + raise ValueError(f"Invalid src_lang {src_lang} or tgt_lang {tgt_lang}") + + if schema not in ["source", "seacrowd_t2t"]: + raise ValueError(f"Invalid schema: {schema}") + + return SEACrowdConfig( + name="phomt_{src}_{tgt}_{schema}".format(src=src_lang, tgt=tgt_lang, schema=schema), + version=datasets.Version(version), + description="phomt schema for {schema} from {src} to {tgt}".format(schema=schema, src=src_lang, tgt=tgt_lang), + schema=schema, + subset_id="phomt_{src}_{tgt}".format(src=src_lang, tgt=tgt_lang), + ) + + +class PhoMT(datasets.GeneratorBasedBuilder): + """ + PhoMT is a high-quality and large-scale Vietnamese-English parallel dataset of 3.02M sentence pairs, which is + 2.9M pairs larger than the benchmark Vietnamese-English machine translation corpus IWSLT15. 
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + seacrowd_config_constructor("eng", "vie", "source", _SOURCE_VERSION), + seacrowd_config_constructor("eng", "vie", "seacrowd_t2t", _SEACROWD_VERSION), + ] + + DEFAULT_CONFIG_NAME = "phomt_eng_vie_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema in ("source", "seacrowd_t2t"): + features = schemas.text2text_features + else: + raise ValueError(f"Invalid config schema: {self.config.schema}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + if self.config.data_dir is None: + raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": os.path.join(data_dir, "detokenization", "train", "train.{lang}")}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": os.path.join(data_dir, "detokenization", "dev", "dev.{lang}")}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": os.path.join(data_dir, "detokenization", "test", "test.{lang}")}, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + config_names_split = self.config.name.split("_") + src_lang = config_names_split[1] + tgt_lang = config_names_split[2] + + src_path = filepath.format(lang=MAP_LANG[src_lang]) + tgt_path = filepath.format(lang=MAP_LANG[tgt_lang]) + + with open(src_path, "r", encoding="utf8") as f: + src_lines = f.readlines() + with open(tgt_path, "r", encoding="utf8") as f: + tgt_lines = f.readlines() + + if self.config.schema in ("source", "seacrowd_t2t"): + for idx, (src_line, tgt_line) in enumerate(zip(src_lines, tgt_lines)): + ex = { + "id": str(idx), + "text_1": src_line.strip(), + "text_2": tgt_line.strip(), + "text_1_name": src_lang, + "text_2_name": tgt_lang, + } + yield idx, ex + + else: + raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.") diff --git a/seacrowd/sea_datasets/prdect_id/__init__.py b/seacrowd/sea_datasets/prdect_id/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/prdect_id/prdect_id.py b/seacrowd/sea_datasets/prdect_id/prdect_id.py new file mode 100644 index 000000000..4a3c590c8 --- /dev/null +++ b/seacrowd/sea_datasets/prdect_id/prdect_id.py @@ -0,0 +1,161 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd +from datasets.download.download_manager import DownloadManager + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@article{SUTOYO2022108554, +title = {PRDECT-ID: Indonesian product reviews dataset for emotions classification tasks}, +journal = {Data in Brief}, +volume = {44}, +pages = {108554}, +year = {2022}, +issn = {2352-3409}, +doi = {https://doi.org/10.1016/j.dib.2022.108554}, +url = {https://www.sciencedirect.com/science/article/pii/S2352340922007612}, +author = {Rhio Sutoyo and Said Achmad and Andry Chowanda and Esther Widhi Andangsari and Sani M. 
Isa}, +keywords = {Natural language processing, Text processing, Text mining, Emotions classification, Sentiment analysis}, +abstract = {Recognizing emotions is vital in communication. Emotions convey +additional meanings to the communication process. Nowadays, people can +communicate their emotions on many platforms; one is the product review. Product +reviews in the online platform are an important element that affects customers’ +buying decisions. Hence, it is essential to recognize emotions from the product +reviews. Emotions recognition from the product reviews can be done automatically +using a machine or deep learning algorithm. Dataset can be considered as the +fuel to model the recognizer. However, only a limited dataset exists in +recognizing emotions from the product reviews, particularly in a local language. +This research contributes to the dataset collection of 5400 product reviews in +Indonesian. It was carefully curated from various (29) product categories, +annotated with five emotions, and verified by an expert in clinical psychology. +The dataset supports an innovative process to build automatic emotion +classification on product reviews.} +} +""" + +_LOCAL = False +_LANGUAGES = ["ind"] +_DATASETNAME = "prdect_id" +_DESCRIPTION = """ +PRDECT-ID Dataset is a collection of Indonesian product review data annotated +with emotion and sentiment labels. The data were collected from one of the giant +e-commerce in Indonesia named Tokopedia. The dataset contains product reviews +from 29 product categories on Tokopedia that use the Indonesian language. Each +product review is annotated with a single emotion, i.e., love, happiness, anger, +fear, or sadness. The group of annotators does the annotation process to provide +emotion labels by following the emotions annotation criteria created by an +expert in clinical psychology. Other attributes related to the product review +are also extracted, such as Location, Price, Overall Rating, Number Sold, Total +Review, and Customer Rating, to support further research. 
+""" + +_HOMEPAGE = "https://data.mendeley.com/datasets/574v66hf2v/1" +_LICENSE = Licenses.CC_BY_4_0.value +_URL = "https://data.mendeley.com/public-files/datasets/574v66hf2v/files/f258d159-c678-42f1-9634-edf091a0b1f3/file_downloaded" + +_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS, Tasks.EMOTION_CLASSIFICATION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class PrdectIDDataset(datasets.GeneratorBasedBuilder): + """PRDECT-ID Dataset""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "text" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_emotion_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_emotion", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_sentiment_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_sentiment", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_emotion_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema for emotion classification", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}_emotion", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_sentiment_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema for sentiment analysis", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}_sentiment", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + CLASS_LABELS_EMOTION = ["Happy", "Sadness", "Anger", "Love", "Fear"] + CLASS_LABELS_SENTIMENT = ["Positive", "Negative"] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "Category": datasets.Value("string"), + "Product Name": datasets.Value("string"), + "Location": datasets.Value("string"), + "Price": datasets.Value("int32"), + "Overall Rating": datasets.Value("float32"), + "Number Sold": datasets.Value("int32"), + "Total Review": datasets.Value("int32"), + "Customer Rating": datasets.Value("int32"), + "Customer Review": datasets.Value("string"), + "Sentiment": datasets.ClassLabel(names=self.CLASS_LABELS_SENTIMENT), + "Emotion": datasets.ClassLabel(names=self.CLASS_LABELS_EMOTION), + } + ) + elif self.config.schema == "seacrowd_text": + if self.config.subset_id == f"{_DATASETNAME}_emotion": + features = schemas.text_features(label_names=self.CLASS_LABELS_EMOTION) + elif self.config.subset_id == f"{_DATASETNAME}_sentiment": + features = schemas.text_features(label_names=self.CLASS_LABELS_SENTIMENT) + else: + raise ValueError(f"Invalid subset: {self.config.subset_id}") + else: + raise ValueError(f"Schema '{self.config.schema}' is not defined.") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + data_file = Path(dl_manager.download(_URL)) + return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": data_file})] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + """Yield examples as (key, example) tuples""" + df = pd.read_csv(filepath, encoding="utf-8") + for idx, row in df.iterrows(): + if self.config.schema == "source": + yield idx, dict(row) + elif 
self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
+                if self.config.subset_id == f"{_DATASETNAME}_emotion":
+                    yield idx, {"id": idx, "text": row["Customer Review"], "label": row["Emotion"]}
+                elif self.config.subset_id == f"{_DATASETNAME}_sentiment":
+                    yield idx, {"id": idx, "text": row["Customer Review"], "label": row["Sentiment"]}
+                else:
+                    raise ValueError(f"Invalid subset: {self.config.subset_id}")
diff --git a/seacrowd/sea_datasets/qasina/__init__.py b/seacrowd/sea_datasets/qasina/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/qasina/qasina.py b/seacrowd/sea_datasets/qasina/qasina.py
new file mode 100644
index 000000000..35bad2464
--- /dev/null
+++ b/seacrowd/sea_datasets/qasina/qasina.py
@@ -0,0 +1,173 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """\
+@misc{rizqullah2023qasina,
+title={QASiNa: Religious Domain Question Answering using Sirah Nabawiyah},
+author={Muhammad Razif Rizqullah and Ayu Purwarianti and Alham Fikri Aji},
+year={2023},
+eprint={2310.08102},
+archivePrefix={arXiv},
+primaryClass={cs.CL}
+}
+"""
+
+_DATASETNAME = "qasina"
+
+_DESCRIPTION = """\
+Question Answering Sirah Nabawiyah Dataset (QASiNa) is an extractive \
+QA dataset built to perform the QA task in the Sirah Nabawiyah domain.
+""" + +_HOMEPAGE = "https://github.com/rizquuula/QASiNa" + +_LANGUAGES = ["ind"] + +_LICENSE = Licenses.MIT.value +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://github.com/rizquuula/QASiNa/raw/main/QASiNa.json", +} + + +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case +class QasinaDataset(datasets.GeneratorBasedBuilder): + """Question Answering Sirah Nabawiyah Dataset (QASiNa) is \ + Extractive QA Dataset which build to perform QA task in Sirah Nabawiyah domain.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "qa" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "context_id": datasets.Value("int32"), + "context": datasets.Value("string"), + "question_answers": datasets.Sequence({"type": datasets.Value("string"), "question": datasets.Value("string"), "answer": datasets.Value("string"), "answer_start": datasets.Value("int32"), "question_id": datasets.Value("int32")}), + "context_length": datasets.Value("int32"), + "context_title": datasets.Value("string"), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.qa.features + features["meta"] = {"context_title": datasets.Value("string"), "answer_start": datasets.Value("int32"),"context_length": datasets.Value("int32"), "type": datasets.Value("string")} + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS[_DATASETNAME] + filepath = dl_manager.download(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": filepath, + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + with open(filepath) as file: + dataset = json.load(file) + + if self.config.schema == "source": + for i, line in enumerate(dataset): + yield i, { + "context_id": line["context_id"], + "context": line["context"], + "question_answers": [ + { + "type": subline["type"], + "question": subline["question"], + "answer": subline["answer"], + "answer_start": subline["answer_start"], + "question_id": subline["question_id"], + } + for subline in line["question_answers"] + ], + "context_length": line["context_length"], + "context_title": line["context_title"], + } + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + for line in dataset: + for question_answer in line["question_answers"]: + id = question_answer["question_id"] + + yield id, { + "id": id, + "question_id": question_answer["question_id"], + "document_id": line["context_id"], + "question": question_answer["question"], + "type": "extractive", 
+ "choices": [], + "context": line["context"], + "answer": [question_answer["answer"]], + "meta": { + "context_title": line["context_title"], + "answer_start": question_answer["answer_start"], + "context_length": line["context_length"], + "type": question_answer["type"], + }, + } diff --git a/seacrowd/sea_datasets/roots_vi_ted/__init__.py b/seacrowd/sea_datasets/roots_vi_ted/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/roots_vi_ted/roots_vi_ted.py b/seacrowd/sea_datasets/roots_vi_ted/roots_vi_ted.py new file mode 100644 index 000000000..3240ce8de --- /dev/null +++ b/seacrowd/sea_datasets/roots_vi_ted/roots_vi_ted.py @@ -0,0 +1,128 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@inproceedings{DBLP:conf/nips/LaurenconSWAMSW22, + author={Hugo Laurençon and Lucile Saulnier and Thomas Wang and Christopher Akiki and Albert Villanova del Moral and + Teven Le Scao and Leandro von Werra and Chenghao Mou and Eduardo González Ponferrada and Huu Nguyen and Jörg Frohberg + and Mario Sasko and Quentin Lhoest and Angelina McMillan-Major and Gérard Dupont and Stella Biderman and Anna Rogers + and Loubna Ben Allal and Francesco De Toni and Giada Pistilli and Olivier Nguyen and Somaieh Nikpoor and Maraim Masoud + and Pierre Colombo and Javier de la Rosa and Paulo Villegas and Tristan Thrush and Shayne Longpre and Sebastian Nagel + and Leon Weber and Manuel Muñoz and Jian Zhu and Daniel van Strien and Zaid Alyafeai and Khalid Almubarak and Minh + Chien Vu and Itziar Gonzalez-Dios and Aitor Soroa and Kyle Lo and Manan Dey and Pedro Ortiz Suarez and Aaron Gokaslan + and Shamik Bose and David Ifeoluwa Adelani and Long Phan and Hieu Tran and Ian Yu and Suhas Pai and Jenny Chim and + Violette Lepercq and Suzana Ilic and Margaret Mitchell and Alexandra Sasha Luccioni and Yacine Jernite}, + title={The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset}, + year={2022}, + cdate={1640995200000}, + url={http://papers.nips.cc/paper_files/paper/2022/hash/ce9e92e3de2372a4b93353eb7f3dc0bd-Abstract-Datasets_and_Benchmarks.html}, + booktitle={NeurIPS}, +} +""" + +_DATASETNAME = "roots_vi_ted" + +_DESCRIPTION = """ +ROOTS_vi_ted is a subset of Vietnamese in ted_talks_iwslt datasets. ted_talks_iwslt is a collection of the original Ted +talks and their translated version. The translations are available in more than 109+ languages, though the distribution +is not uniform. Before using this dataloader, please accept the acknowledgement at +https://huggingface.co/datasets/bigscience-data/roots_vi_ted_talks_iwslt and use huggingface-cli login for authentication. 
+""" + +_HOMEPAGE = "https://huggingface.co/datasets/bigscience-data/roots_vi_ted_talks_iwslt" + +_LANGUAGES = ["vie"] + +_LICENSE = Licenses.CC_BY_NC_ND_4_0.value + +_LOCAL = False + +_URLS = {_DATASETNAME: {"train": "https://huggingface.co/datasets/bigscience-data/roots_vi_ted_talks_iwslt/resolve/main/data/train-00000-of-00001.parquet?download=true"}} + +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class RootsViTedDataset(datasets.GeneratorBasedBuilder): + """RootsViTed is a subset of Vietnamese in ted_talks_iwslt datasets.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="roots_vi_ted_source", + version=SOURCE_VERSION, + description="roots_vi_ted source schema", + schema="source", + subset_id="roots_vi_ted", + ), + SEACrowdConfig( + name="roots_vi_ted_seacrowd_ssp", + version=SEACROWD_VERSION, + description="roots_vi_ted SEACrowd schema", + schema="seacrowd_ssp", + subset_id="roots_vi_ted", + ), + ] + + DEFAULT_CONFIG_NAME = "roots_vi_ted_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "text": datasets.Value("string"), + "meta": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_ssp": + features = schemas.self_supervised_pretraining.features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": data_dir, "split": "train"}, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + if self.config.schema == "source": + df = pd.read_parquet(filepath[split]) + for i, row in df.iterrows(): + yield i, { + "text": row["text"], + "meta": row["meta"], + } + + elif self.config.schema == "seacrowd_ssp": + df = pd.read_parquet(filepath[split]) + for i, row in df.iterrows(): + yield i, { + "id": str(i), + "text": row["text"], + } diff --git a/seacrowd/sea_datasets/saltik/__init__.py b/seacrowd/sea_datasets/saltik/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/saltik/saltik.py b/seacrowd/sea_datasets/saltik/saltik.py new file mode 100644 index 000000000..bcccef02b --- /dev/null +++ b/seacrowd/sea_datasets/saltik/saltik.py @@ -0,0 +1,133 @@ +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import jsonlines + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{, + author = {Audah, Hanif Arkan and Yuliawati, Arlisa and Alfina, Ika}, + title = {A Comparison Between SymSpell and a Combination of Damerau-Levenshtein Distance With the Trie Data Structure}, + journal = {2023 10th International Conference on Advanced Informatics: Concept, Theory and Application (ICAICTA)}, + volume = {}, + year = {2023}, + url = {https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10390399&casa_token=HtJUCIGGlWYAAAAA:q8ll1RWmpHtSAq2Qp5uQAE1NJETx7tUYFZIvTO1IWoaYy4eqFETSsm9p6C7tJwLZBGq5y8zc3A&tag=1}, + doi = {}, + biburl = 
{https://github.com/ir-nlp-csui/saltik?tab=readme-ov-file#references}, + bibsource = {https://github.com/ir-nlp-csui/saltik?tab=readme-ov-file#references} +} +""" + +_DATASETNAME = "saltik" +_DESCRIPTION = """\ +Saltik is a dataset for benchmarking non-word error correction method accuracy in evaluating Indonesian words. +It consists of 58,532 non-word errors generated from 3,000 of the most popular Indonesian words. +""" +_HOMEPAGE = "https://github.com/ir-nlp-csui/saltik" +_LANGUAGES = ["ind"] +_LICENSE = Licenses.AGPL_3_0.value +_LOCAL = False +_URLS = { + _DATASETNAME: "https://raw.githubusercontent.com/ir-nlp-csui/saltik/main/saltik.json", +} +_SUPPORTED_TASKS = [Tasks.NON_WORD_ERROR_SPELLING_CORRECTION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class Saltik(datasets.GeneratorBasedBuilder): + """It consists of 58,532 non-word errors generated from 3,000 of the most popular Indonesian words.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + # EX: Arbitrary NER type dataset + features = datasets.Features( + { + "id": datasets.Value("string"), + "word": datasets.Value("string"), + "errors": [ + { + "typo": datasets.Value("string"), + "error_type": datasets.Value("string"), + } + ], + } + ) + else: + raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + file_path = dl_manager.download(urls) + data = self._read_jsonl(file_path) + all_words = list(data.keys()) + processed_data = [] + id = 0 + for word in all_words: + processed_data.append({"id": id, "word": word, "errors": data[word]}) + id += 1 + self._write_jsonl(file_path + ".jsonl", processed_data) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": file_path + ".jsonl", + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + if self.config.schema == "source": + i = 0 + with jsonlines.open(filepath) as f: + for each_data in f.iter(): + ex = { + "id": each_data["id"], + "word": each_data["word"], + "errors": each_data["errors"], + } + + yield i, ex + i += 1 + + def _read_jsonl(self, filepath: Path): + with open(filepath) as user_file: + parsed_json = json.load(user_file) + return parsed_json + + def _write_jsonl(self, filepath, values): + with jsonlines.open(filepath, "w") as writer: + for line in values: + writer.write(line) diff --git a/seacrowd/sea_datasets/sampiran/__init__.py b/seacrowd/sea_datasets/sampiran/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/sap_wat/__init__.py b/seacrowd/sea_datasets/sap_wat/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/sap_wat/sap_wat.py 
b/seacrowd/sea_datasets/sap_wat/sap_wat.py new file mode 100644 index 000000000..b638e17af --- /dev/null +++ b/seacrowd/sea_datasets/sap_wat/sap_wat.py @@ -0,0 +1,175 @@ +from pathlib import Path +from typing import List + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_DATASETNAME = "sap_wat" + +_LANGUAGES = ["eng", "ind", "zlm", "tha", "vie"] + +_CITATION = """\ +@inproceedings{buschbeck-exel-2020-parallel, + title = "A Parallel Evaluation Data Set of Software Documentation with Document Structure Annotation", + author = "Buschbeck, Bianka and + Exel, Miriam", + editor = "Nakazawa, Toshiaki and + Nakayama, Hideki and + Ding, Chenchen and + Dabre, Raj and + Kunchukuttan, Anoop and + Pa, Win Pa and + Bojar, Ond{\v{r}}ej and + Parida, Shantipriya and + Goto, Isao and + Mino, Hidaya and + Manabe, Hiroshi and + Sudoh, Katsuhito and + Kurohashi, Sadao and + Bhattacharyya, Pushpak", + booktitle = "Proceedings of the 7th Workshop on Asian Translation", + month = dec, + year = "2020", + address = "Suzhou, China", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2020.wat-1.20", + pages = "160--169", + abstract = "This paper accompanies the software documentation data set for machine translation, a parallel + evaluation data set of data originating from the SAP Help Portal, that we released to the machine translation + community for research purposes. It offers the possibility to tune and evaluate machine translation systems + in the domain of corporate software documentation and contributes to the availability of a wider range of + evaluation scenarios. The data set comprises of the language pairs English to Hindi, Indonesian, Malay and + Thai, and thus also increases the test coverage for the many low-resource language pairs. Unlike most evaluation + data sets that consist of plain parallel text, the segments in this data set come with additional metadata that + describes structural information of the document context. We provide insights into the origin and creation, the + particularities and characteristics of the data set as well as machine translation results.", +} + +""" + +_DESCRIPTION = """The data set originates from the SAP Help Portal that contains documentation for SAP products and user +assistance for product-related questions. The data has been processed in a way that makes it suitable as development and +test data for machine translation purposes. The current language scope is English to Hindi, Indonesian, Japanese, Korean, +Malay, Thai, Vietnamese, Simplified Chinese and Traditional Chinese. For each language pair about 4k segments are available, +split into development and test data. The segments are provided in their document context and are annotated with additional +metadata from the document.""" + +_HOMEPAGE = "https://github.com/SAP/software-documentation-data-set-for-machine-translation" + +_LICENSE = Licenses.CC_BY_NC_4_0.value + +_URLs = { + _DATASETNAME: "https://raw.githubusercontent.com/SAP/software-documentation-data-set-for-machine-translation/master/{split}_data/en{lang}/software_documentation.{split}.en{lang}.{appx}" +} + +_SUPPORTED_TASKS = [ + Tasks.MACHINE_TRANSLATION +] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + +_SUBSET = ["id", "ms", "th", "vi"] + +class SapWatDataset(datasets.GeneratorBasedBuilder): + """SAP WAT is a software documentation dataset for machine translation. 
The current language scope is English to Hindi, + Indonesian, Japanese, Korean, Malay, Thai, Vietnamese, Simplified Chinese and Traditional Chinese. Here, we only consider + EN-ID, EN-TH, EN-MS, EN-VI""" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_en_{lang}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"SAP WAT source schema for EN-{lang.upper()}", + schema="source", + subset_id=f"{_DATASETNAME}_en_{lang}", + ) + for lang in _SUBSET] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_en_{lang}_seacrowd_t2t", + version=datasets.Version(_SEACROWD_VERSION), + description=f"SAP WAT SEACrowd schema for EN-{lang.upper()}", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}_en_{lang}", + ) + for lang in _SUBSET + ] + + DEFAULT_CONFIG_NAME = "sap_wat_en_id_source" + + def _info(self): + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + "label": datasets.Value("string") + } + ) + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: + lang = self.config.name.split("_")[3] + + splits = {datasets.Split.VALIDATION: "dev", datasets.Split.TEST: "test"} + data_urls = { + split: _URLs[_DATASETNAME].format(split=splits[split], lang=lang, appx=lang) for split in splits + } + dl_paths = dl_manager.download(data_urls) + + en_data_urls = { + split: _URLs[_DATASETNAME].format(split=splits[split], lang=lang, appx="en") for split in splits + } + en_dl_paths = dl_manager.download(en_data_urls) + return [ + datasets.SplitGenerator( + name=split, + gen_kwargs={"filepath": dl_paths[split], "en_filepath": en_dl_paths[split]}, + ) + for split in splits + ] + + def _generate_examples(self, filepath: Path, en_filepath: Path): + with open(en_filepath, "r") as f: + lines_1 = f.readlines() + with open(filepath, "r") as f: + lines_2 = f.readlines() + + if self.config.schema == "source": + for _id, (line_1, line_2) in enumerate(zip(lines_1, lines_2)): + ex = { + "id": _id, + "text": line_1.strip(), + "label": line_2.strip() + } + yield _id, ex + + elif self.config.schema == "seacrowd_t2t": + lang = self.config.name.split("_")[3] + lang_name = _LANGUAGES[_SUBSET.index(lang)+1] + + for _id, (line_1, line_2) in enumerate(zip(lines_1, lines_2)): + ex = { + "id": _id, + "text_1": line_1.strip(), + "text_2": line_2.strip(), + "text_1_name": 'eng', + "text_2_name": lang_name, + } + yield _id, ex + else: + raise ValueError(f"Invalid config: {self.config.name}") \ No newline at end of file diff --git a/seacrowd/sea_datasets/sarawak_malay/__init__.py b/seacrowd/sea_datasets/sarawak_malay/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/sarawak_malay/sarawak_malay.py b/seacrowd/sea_datasets/sarawak_malay/sarawak_malay.py new file mode 100644 index 000000000..6c0579d82 --- /dev/null +++ b/seacrowd/sea_datasets/sarawak_malay/sarawak_malay.py @@ -0,0 +1,178 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import audiosegment +import datasets +import textgrid + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@INPROCEEDINGS{ +10337314, +author={Rahim, Mohd Zulhafiz and Juan, Sarah Samson and Mohamad, Fitri Suraya}, +booktitle={2023 International Conference on Asian Language Processing (IALP)}, +title={Improving Speaker Diarization for Low-Resourced Sarawak Malay Language Conversational Speech Corpus}, +year={2023}, +pages={228-233}, +keywords={Training;Oral communication;Data models;Usability;Speech processing;Testing;Speaker diarization;x-vectors;clustering;low-resource;auto-labeling;pseudo-labeling;unsupervised}, +doi={10.1109/IALP61005.2023.10337314}} +""" + +_DATASETNAME = "sarawak_malay" + +_DESCRIPTION = """\ +This is a Sarawak Malay conversation data for the purpose of speech technology research. \ +At the moment, this is an experimental data and currently used for investigating \ +speaker diarization. The data was collected by Faculty of Computer Science and \ +Information Technology, Universiti Malaysia Sarawak. The data consists of 38 conversations \ +that have been transcribed using Transcriber (see TextGrid folder), where each file \ +contains two speakers. Each conversation was recorded by different individuals using microphones \ +from mobile devices or laptops thus, different file formats were collected from the data collectors. \ +All data was then standardized to mono, 16000Khz, wav format. +""" + +_HOMEPAGE = "https://github.com/sarahjuan/sarawakmalay" + +_LANGUAGES = ["zlm"] + +_LICENSE = Licenses.CC0_1_0.value +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://github.com/sarahjuan/sarawakmalay/archive/refs/heads/main.zip", +} +_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION, Tasks.TEXT_TO_SPEECH] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class SarawakMalayDataset(datasets.GeneratorBasedBuilder): + """This is experimental Sarawak Malay conversation data collected by \ + Universiti Malaysia Sarawak for speech technology research, \ + specifically speaker diarization. 
The data includes 38 conversations, \ + each with two speakers, recorded on various devices and then standardized to mono, \ + 16000Khz, wav format.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + SEACROWD_SCHEMA_NAME = "sptext" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "speaker_id": datasets.Value("string"), + "path": datasets.Value("string"), + "audio": datasets.Audio(sampling_rate=16_000), + "text": datasets.Value("string"), + "metadata": { + "malay_text": datasets.Value("string"), + }, + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.speech_text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(data_dir, "sarawakmalay-main"), + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + id_counter = 0 + filenames = filter(lambda x: x.endswith(".wav"), os.listdir(f"{filepath}/wav")) + filenames = map(lambda x: x.replace(".wav", ""), filenames) + + os.makedirs(f"{filepath}/segmented", exist_ok=True) + for i, filename in enumerate(filenames): + info = textgrid.TextGrid.fromFile(f"{filepath}/TextGrid/{filename}.TextGrid") + if len(info) == 3: + sarawak_conversation, malay_conversation, speakers = info + else: + sarawak_conversation, malay_conversation, speakers, _ = info + + audio_file = audiosegment.from_file(f"{filepath}/wav/{filename}.wav").resample(sample_rate_Hz=16000) + + for sarawak_tg, malay_tg, speaker in zip(sarawak_conversation, malay_conversation, speakers): + start, end, text = sarawak_tg.minTime, sarawak_tg.maxTime, sarawak_tg.mark + malay_text = malay_tg.mark + speaker_id = speaker.mark + + start_sec, end_sec = int(start * 1000), int(end * 1000) + segment = audio_file[start_sec:end_sec] + segement_filename = f"{filepath}/segmented/{filename}-{round(start, 0)}-{round(end, 0)}.wav" + segment.export(segement_filename, format="wav") + + if self.config.schema == "source": + yield id_counter, { + "id": id_counter, + "speaker_id": speaker_id, + "path": f"{filepath}/wav/{filename}.wav", + "audio": segement_filename, + "text": text, + "metadata": { + "malay_text": malay_text, + }, + } + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + yield id_counter, {"id": id_counter, "speaker_id": speaker_id, "path": f"{filepath}/wav/{filename}.wav", "audio": segement_filename, "text": text, "metadata": {"speaker_age": None, "speaker_gender": None}} + + id_counter += 1 diff --git 
a/seacrowd/sea_datasets/scb_mt_en_th/__init__.py b/seacrowd/sea_datasets/scb_mt_en_th/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/scb_mt_en_th/scb_mt_en_th.py b/seacrowd/sea_datasets/scb_mt_en_th/scb_mt_en_th.py new file mode 100644 index 000000000..745cf7c89 --- /dev/null +++ b/seacrowd/sea_datasets/scb_mt_en_th/scb_mt_en_th.py @@ -0,0 +1,165 @@ +# coding=utf-8 +import json +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{Lowphansirikul2021, + author={Lowphansirikul, Lalita + and Polpanumas, Charin + and Rutherford, Attapol T. + and Nutanong, Sarana}, + title={A large English--Thai parallel corpus from the web and machine-generated text}, + journal={Language Resources and Evaluation}, + year={2021}, + month={Mar}, + day={30}, + issn={1574-0218}, + doi={10.1007/s10579-021-09536-6}, + url={https://doi.org/10.1007/s10579-021-09536-6} +""" + +_DATASETNAME = "scb_mt_en_th" + +_DESCRIPTION = """\ +A Large English-Thai Parallel Corpus The primary objective of our work is to build a large-scale English-Thai dataset +for machine translation. We construct an English-Thai machine translation dataset with over 1 million segment pairs, +curated from various sources, namely news, Wikipedia articles, SMS messages, task-based dialogs, web-crawled data and +government documents. Methodology for gathering data, building parallel texts and removing noisy sentence pairs are +presented in a reproducible manner. We train machine translation models based on this dataset. Our models' performance +are comparable to that of Google Translation API (as of May 2020) for Thai-English and outperform Google when the Open +Parallel Corpus (OPUS) is included in the training data for both Thai-English and English-Thai translation. The dataset, +pre-trained models, and source code to reproduce our work are available for public use. + +""" + +_HOMEPAGE = "https://github.com/vistec-AI/thai2nmt" + +_LICENSE = Licenses.CC_BY_SA_4_0.value + +_LANGUAGES = ["tha", "eng"] +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://archive.org/download/scb_mt_enth_2020/data.zip", +} + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + +SEACROWD_TO_SOURCE_LANGCODE_DICT = {"eng": "en", "tha": "th"} + + +class ScbMtEnThDataset(datasets.GeneratorBasedBuilder): + """ + A Large English-Thai Parallel Corpus The primary objective of our work is to build a large-scale English-Thai + dataset for machine translation. We construct an English-Thai machine translation dataset with over 1 million + segment pairs, curated from various sources, namely news, Wikipedia articles, SMS messages, task-based dialogs, + web-crawled data and government documents. + Methodology for gathering data, building parallel texts and removing noisy sentence pairs are presented in a + reproducible manner. We train machine translation models based on this dataset. Our models' performance are + comparable to that of Google Translation API (as of May 2020) for Thai-English and outperform Google when the Open + Parallel Corpus (OPUS) is included in the training data for both Thai-English and English-Thai translation. 
+ The dataset,pre-trained models, and source code to reproduce our work are available for public use.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_tha_eng_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema: Thai to English", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_tha_eng_seacrowd_t2t", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema: Thai to English", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_eng_tha_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema: English to Thai", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_eng_tha_seacrowd_t2t", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema: English to Thai", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_tha_eng_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + language_pair = [SEACROWD_TO_SOURCE_LANGCODE_DICT[lang] for lang in self.config.name.split("_")[4:6]] + features = datasets.Features( + { + "translation": datasets.features.Translation(language_pair), + "subdataset": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS[_DATASETNAME] + data_path = dl_manager.download_and_extract(urls) + data_dir = os.path.join(data_path, "data") + + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": os.path.join(data_dir, "train.jsonl")}), + datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": os.path.join(data_dir, "valid.jsonl")}), + datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": os.path.join(data_dir, "test.jsonl")}), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + with open(filepath, encoding="utf-8") as f: + if self.config.schema == "source": + language_pair = [SEACROWD_TO_SOURCE_LANGCODE_DICT[lang] for lang in self.config.name.split("_")[4:6]] + source, target = language_pair + for id_, row in enumerate(f): + data = json.loads(row) + yield id_, { + "translation": {source: data[source], target: data[target]}, + "subdataset": data["subdataset"], + } + + elif self.config.schema == "seacrowd_t2t": + source, target = self.config.name.split("_")[4:6] + for id_, row in enumerate(f): + data = json.loads(row) + ex = { + "id": str(id_), + "text_1": data[SEACROWD_TO_SOURCE_LANGCODE_DICT[source]], + "text_2": data[SEACROWD_TO_SOURCE_LANGCODE_DICT[target]], + "text_1_name": source, + "text_2_name": target, + } + yield id_, ex diff --git a/seacrowd/sea_datasets/sea_bench/__init__.py b/seacrowd/sea_datasets/sea_bench/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/sea_bench/sea_bench.py b/seacrowd/sea_datasets/sea_bench/sea_bench.py new file mode 100644 index 000000000..f57b3206b --- /dev/null +++ b/seacrowd/sea_datasets/sea_bench/sea_bench.py @@ -0,0 +1,193 @@ +# 
coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{damonlpsg2023seallm, + author = {Xuan-Phi Nguyen*, Wenxuan Zhang*, Xin Li*, Mahani Aljunied*, + Qingyu Tan, Liying Cheng, Guanzheng Chen, Yue Deng, Sen Yang, + Chaoqun Liu, Hang Zhang, Lidong Bing}, + title = {SeaLLMs - Large Language Models for Southeast Asia}, + year = 2023, + Eprint = {arXiv:2312.00738}, + url = {https://arxiv.org/pdf/2312.00738.pdf}, +} +""" + +_DATASETNAME = "sea_bench" + +_DESCRIPTION = """\ +Sea-bench is a multilingual benchmark for assistant-style models annotated by native linguists +covering 8 Southeast Asian languages. The linguists sourced such data by manually translating +open-source English test sets, collecting real user questions from local forums and websites, +collecting real math and reasoning questions from reputable sources, as well as writing test +instructions and questions themselves. The Sea-bench test set contains 20 questions per task +(5 tasks for 3 languages, 4 tasks for other 5 languages). +""" + +_HOMEPAGE = "https://huggingface.co/datasets/SeaLLMs/Sea-bench" + +_LANGUAGES = ["eng", "ind", "khm", "lao", "mya", "tgl", "tha", "vie", "zlm"] + +_LICENSE = Licenses.APACHE_2_0.value + +_LOCAL = False + +_URLS = "https://huggingface.co/datasets/SeaLLMs/Sea-bench/raw/main/question.jsonl" + +_SUPPORTED_TASKS = [Tasks.INSTRUCTION_TUNING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class SeaBenchDataset(datasets.GeneratorBasedBuilder): + """ + Sea-bench is a multilingual benchmark from https://huggingface.co/datasets/SeaLLMs/Sea-bench. 
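+    Config names follow the pattern sea_bench_{lang}_source and sea_bench_{lang}_seacrowd_t2t for each language code in
+    LANGUAGES_DICT, alongside the aggregate sea_bench_source and sea_bench_seacrowd_t2t configs that cover every language.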
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + LANGUAGES_DICT = {"tgl": "tl", "khm": "km", "vie": "vi", "tha": "th", "lao": "lo", "mya": "my", "ind": "id", "zlm": "ms", "eng": "en"} + + BUILDER_CONFIGS = ( + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema for all 8 languages", + schema="source", + subset_id=f"{_DATASETNAME}", + ) + ] + + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME}_{lang} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{lang}", + ) + for lang in LANGUAGES_DICT + ] + + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_t2t", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema for T2T for all 8 languages", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}", + ) + ] + + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{lang}_seacrowd_t2t", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME}_{lang} SEACrowd schema for T2T", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}_{lang}", + ) + for lang in LANGUAGES_DICT + ] + ) + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "question_id": datasets.Value("int64"), + "category": datasets.Value("string"), + "lang": datasets.Value("string"), + "turns": datasets.Sequence(datasets.Value("string")), + "chatgpt_response": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + else: + raise ValueError(f"Invalid schema: '{self.config.schema}'") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """ + Returns SplitGenerators. + """ + + train_path = dl_manager.download_and_extract(_URLS) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": train_path, + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """ + Yields examples as (key, example) tuples. 
+ """ + subset_id = self.config.subset_id.split("_") + if len(subset_id) > 2: + language_list = subset_id[2] + if language_list in self.LANGUAGES_DICT: + language_list = [self.LANGUAGES_DICT[language_list]] + else: + language_list = list(self.LANGUAGES_DICT.values()) + + idx = 0 + with open(filepath, "r") as f: + data = list(map(json.loads, f)) + if self.config.schema == "source": + for d in data: + if d["lang"] in language_list: + x = {k: v if v != "" and k in self.info.features else None for k, v in d.items()} + if "chatgpt_response" not in x: + x["chatgpt_response"] = "" + yield idx, x + idx += 1 + elif self.config.schema == "seacrowd_t2t": + for d in data: + if d["lang"] in language_list: + x = { + "id": idx, + "text_1": d["turns"][0] if "turns" in d else "", + "text_2": d["chatgpt_response"] if "chatgpt_response" in d else "", + "text_1_name": "turns", + "text_2_name": "chatgpt_response", + } + yield idx, x + idx += 1 + else: + raise ValueError(f"Invalid schema: '{self.config.schema}'") diff --git a/seacrowd/sea_datasets/seaeval/__init__.py b/seacrowd/sea_datasets/seaeval/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/seaeval/seaeval.py b/seacrowd/sea_datasets/seaeval/seaeval.py new file mode 100644 index 000000000..db13204d9 --- /dev/null +++ b/seacrowd/sea_datasets/seaeval/seaeval.py @@ -0,0 +1,238 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{SeaEval2023, + title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning}, + author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.}, + journal={arXiv preprint arXiv:2309.04766}, + year={2023}, + url={https://github.com/SeaEval/SeaEval} +} +""" + +_DATASETNAME = "seaeval" + +_DESCRIPTION = """\ +SeaEval is a benchmark toolkit for evaluating multilingual LLMs. The benchmark contains 28 datasets, +covering 7 languages. It contains 2 datasets for cross-lingual consistency, each containing parallel +questions for the 7 represented languages. It alsoc ontains 4 datasets for cultural reasoning +(multiple choice Q&A) that are in English but focused on regions including Singapore and Philipines. + +This dataloader provides examples for Indonesia, Vietnamese, Malay, and Filipino. 
+""" + +_HOMEPAGE = "https://github.com/SeaEval/SeaEval" + +_LANGUAGES = {"ind": "Indonesian", "vie": "Vietnamese", "zlm": "Malay", "fil": "Filipino"} + +_LICENSE = Licenses.CC_BY_NC_4_0.value + +_LOCAL = False + +_URLS = { + "cross_mmlu": "https://huggingface.co/datasets/SeaEval/SeaEval_datasets/raw/main/cross_mmlu.json", + "cross_logiqa": "https://huggingface.co/datasets/SeaEval/SeaEval_datasets/raw/main/cross_logiqa.json", + "sg_eval": "https://huggingface.co/datasets/SeaEval/SeaEval_datasets/raw/main/sg_eval.json", + "ph_eval": "https://huggingface.co/datasets/SeaEval/SeaEval_datasets/raw/main/ph_eval.json", +} + +_SUPPORTED_TASKS = [Tasks.COMMONSENSE_REASONING, Tasks.QUESTION_ANSWERING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class SeaEvalDataset(datasets.GeneratorBasedBuilder): + """ + SeaEval is a benchmark for evaluating multilingual LLMs from https://github.com/SeaEval/SeaEval. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + LANGUAGES_EXCHANGED = dict((v, k) for k, v in _LANGUAGES.items()) + SUBSETS_CROSS_MMLU = ["cross_mmlu_" + lang for lang in _LANGUAGES.keys()] + SUBSETS_CROSS_LOGIQA = ["cross_logiqa_" + lang for lang in _LANGUAGES.keys()] + SUBSETS = SUBSETS_CROSS_MMLU + SUBSETS_CROSS_LOGIQA + ["sg_eval_eng", "ph_eval_eng"] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME}_{subset} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{subset}", + ) + for subset in SUBSETS + ] + + BUILDER_CONFIGS += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_seacrowd_qa", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME}_{subset} SEACrowd schema", + schema="seacrowd_qa", + subset_id=f"{_DATASETNAME}_{subset}", + ) + for subset in SUBSETS + ] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source" and self.config.subset_id not in ["cross_logiqa", "ph_eval"]: + features = datasets.Features( + { + "id": datasets.Value("string"), + "question": datasets.Value("string"), + "choices": datasets.Sequence(datasets.Value("string")), + "answer": datasets.Value("string"), + } + ) + elif self.config.schema == "source" and self.config.subset_id == "cross_logiqa": + features = datasets.Features( + { + "id": datasets.Value("string"), + "question": datasets.Value("string"), + "context": datasets.Value("string"), + "choices": datasets.Sequence(datasets.Value("string")), + "answer": datasets.Value("string"), + } + ) + elif self.config.schema == "source" and self.config.subset_id == "ph_eval": + features = datasets.Features( + { + "id": datasets.Value("string"), + "question": datasets.Value("string"), + "choices": datasets.Sequence(datasets.Value("string")), + "answer": datasets.Value("string"), + "category": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_qa": + features = schemas.qa_features + else: + raise ValueError(f"Unexpected schema received! {self.config.schema}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """ + Returns SplitGenerators. 
+ """ + + data = {key: dl_manager.download_and_extract(value) for key, value in _URLS.items()} + + paths = {} + file = self.config.subset_id.split("_") + file = "_".join(file[1:3]) + paths[self.config.subset_id] = data[file] + + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "paths": paths, + "split": "test", + }, + ), + ] + + def _generate_examples(self, paths: Path, split: str) -> Tuple[int, Dict]: + """ + Yields examples as (key, example) tuples. + """ + + language = self.config.subset_id.split("_")[3] + examples = None + + for key, path in paths.items(): + if "cross" in key: + data = pd.read_json(path).rename(columns=self.LANGUAGES_EXCHANGED) + data = pd.melt(data, id_vars=["id"], value_vars=_LANGUAGES.keys(), var_name="language") + data_flattened = pd.json_normalize(data["value"]) + data_merged = pd.merge(data, data_flattened, left_index=True, right_index=True) + data_filtered = data_merged[data_merged["language"] == language].drop(columns=["value", "language"]) + examples = data_filtered.to_records() + elif "eval" in key: + data = pd.read_json(path) + examples = data.to_records() + + idx = 0 + if self.config.schema == "source" and self.config.subset_id not in ["cross_logiqa", "ph_eval"]: + for row in examples: + x = { + "id": row["id"], + "question": row["question"], + "choices": row["choices"], + "answer": row["answer"], + } + yield idx, x + idx += 1 + elif self.config.schema == "source" and self.config.subset_id == "cross_logiqa": + for row in examples: + x = { + "id": row["id"], + "question": row["question"], + "context": row["context"] if "context" in row else None, + "choices": row["choices"], + "answer": row["answer"], + } + yield idx, x + idx += 1 + elif self.config.schema == "source" and self.config.subset_id == "ph_eval": + for row in examples: + x = { + "id": row["id"], + "question": row["question"], + "choices": row["choices"], + "answer": row["answer"], + "category": row["category"] if "category" in row else None, + } + yield idx, x + idx += 1 + elif self.config.schema == "seacrowd_qa": + for row in examples: + x = { + "id": idx, + "question_id": row["id"], + "document_id": row["id"], + "question": row["question"], + "type": "multiple_choice", + "choices": row["choices"], + "context": row["context"] if "context" in row else None, + "answer": [row["answer"]], + "meta": {}, + } + yield idx, x + idx += 1 + else: + raise ValueError(f"Invalid schema: {self.config.schema}") diff --git a/seacrowd/sea_datasets/seahorse/__init__.py b/seacrowd/sea_datasets/seahorse/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/seahorse/seahorse.py b/seacrowd/sea_datasets/seahorse/seahorse.py new file mode 100644 index 000000000..398f920fe --- /dev/null +++ b/seacrowd/sea_datasets/seahorse/seahorse.py @@ -0,0 +1,194 @@ +from pathlib import Path + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@inproceedings{clark-etal-2023-seahorse, + title = "{SEAHORSE}: A Multilingual, Multifaceted Dataset for Summarization Evaluation", + author = "Clark, Elizabeth and + Rijhwani, Shruti and + Gehrmann, Sebastian and + Maynez, Joshua and + Aharoni, Roee and + Nikolaev, Vitaly and + Sellam, Thibault and + Siddhant, Aditya and + Das, Dipanjan and + Parikh, Ankur", + editor = "Bouamor, Houda and + Pino, Juan and + Bali, Kalika", + booktitle = "Proceedings of the 2023 Conference on 
Empirical Methods in Natural Language Processing", + month = dec, + year = "2023", + address = "Singapore", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2023.emnlp-main.584", + doi = "10.18653/v1/2023.emnlp-main.584", + pages = "9397--9413", +} +""" + +_DATASETNAME = "seahorse" + +_DESCRIPTION = """ +SEAHORSE is a dataset for multilingual, multifaceted summarization evaluation. It consists of 96K summaries with human +ratings along 6 quality dimensions: comprehensibility, repetition, grammar, attribution, main idea(s), and conciseness, +covering 6 languages, 9 systems and 4 datasets. +""" + +_HOMEPAGE = "https://github.com/google-research-datasets/seahorse" + +_LANGUAGES = ["vie"] + +_LICENSE = Licenses.CC_BY_4_0.value + +_LOCAL = False + +_URLS = "https://storage.googleapis.com/seahorse-public/seahorse_data.zip" + +_SUPPORTED_TASKS = [Tasks.SUMMARIZATION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +# The original dataset only contaions gem_id, we need to retrieve the article following https://github.com/google-research-datasets/seahorse?tab=readme-ov-file#retrieving-articles-from-gem +def get_wikilingual_data(lang, split): + ds = datasets.load_dataset("gem", name=f"wiki_lingua_{lang}", split=split) + df = ds.to_pandas() + return dict(zip(*[df[col] for col in ["gem_id", "source"]])) + + +def get_xlsum_data(lang, split): + df = datasets.load_dataset("GEM/xlsum", lang) + return {item["gem_id"]: item["text"] for item in df[split]} + + +# Both train and validation splits in seahorse are taken from the validation split from the original dataset +_WIKILINGUAL_DATA = {split: get_wikilingual_data("vietnamese_vi", split) for split in ["test", "validation"]} +_XLSUM_DATA = {split: get_xlsum_data("vietnamese", split) for split in ["test", "validation"]} + + +def get_article(gem_id, split): + if "wiki_lingua" in gem_id: + data = _WIKILINGUAL_DATA + elif "xlsum" in gem_id: + data = _XLSUM_DATA + else: + raise AssertionError("gem_id should either from wiki_lingua or xlsum.") + return data[split if split == "test" else "validation"][gem_id] + + +class SeahorseDataset(datasets.GeneratorBasedBuilder): + """Seahorse is a dataset for multilingual, multifaceted summarization evaluation.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_t2t", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_t2t", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "gem_id": datasets.Value("string"), + "summary": datasets.Value("string"), + "model": datasets.Value("string"), + "question1": datasets.Value("string"), + "question2": datasets.Value("string"), + "question3": datasets.Value("string"), + "question4": datasets.Value("string"), + "question5": datasets.Value("string"), + "question6": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + 
license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> list[datasets.SplitGenerator]: + data_dir = dl_manager.download_and_extract(_URLS) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": f"{data_dir}/seahorse_data/train.tsv", + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": f"{data_dir}/seahorse_data/validation.tsv", + "split": "dev", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": f"{data_dir}/seahorse_data/test.tsv", + "split": "test", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> tuple[int, dict]: + df = pd.read_csv(filepath, sep="\t") + mask = df["worker_lang"] == "vi" + df_vi = df[mask] + if self.config.schema == "source": + for i, row in df_vi.iterrows(): + yield i, { + "gem_id": row["gem_id"], + "summary": row["summary"], + "model": row["model"], + "question1": row["question1"], + "question2": row["question2"], + "question3": row["question3"], + "question4": row["question4"], + "question5": row["question5"], + "question6": row["question6"], + } + + elif self.config.schema == "seacrowd_t2t": + for i, row in df_vi.iterrows(): + yield i, { + "id": str(i), + "text_1": get_article(row["gem_id"], split), + "text_2": row["summary"], + "text_1_name": "article", + "text_2_name": "summary", + } diff --git a/seacrowd/sea_datasets/snli_indo/__init__.py b/seacrowd/sea_datasets/snli_indo/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/snli_indo/snli_indo.py b/seacrowd/sea_datasets/snli_indo/snli_indo.py new file mode 100644 index 000000000..2c8f4d5fc --- /dev/null +++ b/seacrowd/sea_datasets/snli_indo/snli_indo.py @@ -0,0 +1,158 @@ +""" +SNLI Indo is derived from the SNLI corpus, where the premise and hypothesis sentences are translated directly from English to Indonesian using the Google Cloud Translation API. The SNLI corpus is divided into three sets, namely train, development, and test set. The translation process is applied to all the premise and hypothesis sentences in all the three sets. This ensures that the number of sentence pairs obtained is the same as the original SNLI dataset, namely 570k sentence pairs. A filtering process is carried out to remove incomplete sentence pairs and those with a gold label `-`. As a result, 569,027 sentence pairs are obtained. +""" + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import jsonlines + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{suwija2023snli, + author = "Suwija Putra, I Made + and Siahaan, Daniel + and Saikhu, Ahmad", + title = "SNLI Indo: A recognizing textual entailment dataset in Indonesian derived from the Stanford Natural Language Inference dataset" + year = "2024", + journal = "Data in Brief", + volume = "52", + pages = "109998", + publisher = "Elsevier", + doi = "https://doi.org/10.1016/j.dib.2023.109998", + url = "https://www.sciencedirect.com/science/article/pii/S2352340923010284", +} +""" + +_DATASETNAME = "snli_indo" + +_DESCRIPTION = """\ +The SNLI Indo dataset is derived from the SNLI corpus by translating each premise and hypothesis sentence from English to Indonesia via the Google Cloud Translation API. 
Premise sentences are crawled image captions from Flickr, and hypothesis sentences are manually created through crowdsourcing. Five annotators are assigned per sentence pair to label the inference relationship as entailment (true), contradiction (false) or neutral (undetermined). +""" + +_HOMEPAGE = "https://data.mendeley.com/datasets/k4tjhzs2gd/1" + +_LANGUAGES = ["ind"] + +_LICENSE = Licenses.CC_BY_4_0.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: { + "train": "https://data.mendeley.com/public-files/datasets/k4tjhzs2gd/files/ee45b2bb-e2ea-47b7-bec4-b6653c467d27/file_downloaded", + "val": "https://data.mendeley.com/public-files/datasets/k4tjhzs2gd/files/5e47db3c-ea84-4c73-9a2f-bfd57b4e2c05/file_downloaded", + "test": "https://data.mendeley.com/public-files/datasets/k4tjhzs2gd/files/23aff85c-ff72-48b6-aba1-c1dd5dac216b/file_downloaded", + } +} + +_SUPPORTED_TASKS = [Tasks.TEXTUAL_ENTAILMENT] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class SNLIDataset(datasets.GeneratorBasedBuilder): + """SNLI Indo is derived from the SNLI corpus, where the premise and hypothesis sentences are translated directly from English to Indonesian using the Google Cloud Translation API. This dataset contains ~570k annotated sentence pairs.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="snli_indo_source", + version=SOURCE_VERSION, + description="SNLI Indo source schema", + schema="source", + subset_id="snli_indo", + ), + SEACrowdConfig( + name="snli_indo_seacrowd_pairs", + version=SEACROWD_VERSION, + description="SNLI Indo SEACrowd schema", + schema="seacrowd_pairs", + subset_id="snli_indo", + ), + ] + + DEFAULT_CONFIG_NAME = "snli_indo_source" + labels = ["kontradiksi", "keterlibatan", "netral"] # ["contradiction", "entailment", "neutral" ] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "premise": datasets.Value("string"), + "hypothesis": datasets.Value("string"), + "label": datasets.ClassLabel(names=self.labels), + } + ) + + elif self.config.schema == "seacrowd_pairs": + features = schemas.pairs_features(self.labels) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir["train"], + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_dir["test"], + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_dir["val"], + }, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + if self.config.schema == "source": + with jsonlines.open(filepath) as f: + i = -1 + for example in f.iter(): + i += 1 + yield str(i), { + "premise": example["kalimat1"], + "hypothesis": example["kalimat2"], + "label": example["label emas"], + } + + elif self.config.schema == "seacrowd_pairs": + with jsonlines.open(filepath) as f: + i = -1 + for example in f.iter(): + i += 1 + yield str(i), { + "id": str(i), + "text_1": example["kalimat1"], + "text_2":
example["kalimat2"], + "label": example["label emas"], + } diff --git a/seacrowd/sea_datasets/spamid_pair/__init__.py b/seacrowd/sea_datasets/spamid_pair/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/spamid_pair/spamid_pair.py b/seacrowd/sea_datasets/spamid_pair/spamid_pair.py new file mode 100644 index 000000000..a22f851af --- /dev/null +++ b/seacrowd/sea_datasets/spamid_pair/spamid_pair.py @@ -0,0 +1,160 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{Chrismanto2022, +title = {SPAMID-PAIR: A Novel Indonesian Post–Comment Pairs Dataset Containing Emoji}, +journal = {International Journal of Advanced Computer Science and Applications}, +doi = {10.14569/IJACSA.2022.0131110}, +url = {http://dx.doi.org/10.14569/IJACSA.2022.0131110}, +year = {2022}, +publisher = {The Science and Information Organization}, +volume = {13}, +number = {11}, +author = {Antonius Rachmat Chrismanto and Anny Kartika Sari and Yohanes Suyanto} +} +""" + +_DATASETNAME = "spamid_pair" + + +_DESCRIPTION = """\ +SPAMID-PAIR is data post-comment pairs collected from 13 selected Indonesian public figures (artists) / public accounts +with more than 15 million followers and categorized as famous artists. +It was collected from Instagram using an online tool and Selenium. +Two persons labeled all pair data as an expert in a total of 72874 data. +The data contains Unicode text (UTF-8) and emojis scrapped in posts and comments without account profile information. 
+""" + +_HOMEPAGE = "https://data.mendeley.com/datasets/fj5pbdf95t/1" + +_LANGUAGES = ["ind"] + + +_LICENSE = Licenses.CC_BY_4_0.value + +_LOCAL = False + + +_URLS = { + _DATASETNAME: "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/fj5pbdf95t-1.zip", +} + +_SUPPORTED_TASKS = [Tasks.INTENT_CLASSIFICATION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class SpamidPairDataset(datasets.GeneratorBasedBuilder): + """SPAMID-PAIR is data post-comment pairs collected from 13 selected Indonesian public figures (artists) / public accounts with more than 15 million followers and categorized as famous artists.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + LABEL_CLASSES = [1, 0] + + SEACROWD_SCHEMA_NAME = "text" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "igid": datasets.Value("string"), + "comment": datasets.Value("string"), + "posting": datasets.Value("string"), + "spam": datasets.ClassLabel(names=self.LABEL_CLASSES), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text_features(self.LABEL_CLASSES) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = Path(dl_manager.download_and_extract(urls)) + data_dir = os.path.join(os.path.join(os.path.join(data_dir, "SPAMID-PAIR"), "Raw"), "dataset-raw.xlsx") + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir, + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + data = pd.read_excel(filepath) + + if self.config.schema == "source": + for i, row in data.iterrows(): + yield i, { + "igid": str(row["igid"]), + "comment": str(row["comment"]), + "posting": str(row["posting"]), + "spam": row["spam"], + } + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + for i, row in data.iterrows(): + yield i, { + "id": str(i), + "text": str(row["comment"]) + "\n" + str(row["posting"]), + "label": int(row["spam"]), + } diff --git a/seacrowd/sea_datasets/stb_ext/__init__.py b/seacrowd/sea_datasets/stb_ext/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/stb_ext/stb_ext.py b/seacrowd/sea_datasets/stb_ext/stb_ext.py new file mode 100644 index 000000000..d0f65228f --- /dev/null +++ b/seacrowd/sea_datasets/stb_ext/stb_ext.py @@ -0,0 +1,195 @@ +import io + +import conllu +import datasets + +from seacrowd.utils.common_parser import load_ud_data_as_seacrowd_kb +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils import schemas +from seacrowd.utils.constants 
import DEFAULT_SEACROWD_VIEW_NAME, DEFAULT_SOURCE_VIEW_NAME, Licenses, Tasks + +_DATASETNAME = "stb_ext" +_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME +_UNIFIED_VIEW_NAME = DEFAULT_SEACROWD_VIEW_NAME + +_LANGUAGES = ["eng"] +_LOCAL = False +_CITATION = """\ +@article{wang2019genesis, +title={From genesis to creole language: Transfer learning for singlish universal dependencies parsing and POS tagging}, +author={Wang, Hongmin and Yang, Jie and Zhang, Yue}, +journal={ACM Transactions on Asian and Low-Resource Language Information Processing (TALLIP)}, +volume={19}, +number={1}, +pages={1--29}, +year={2019}, +publisher={ACM New York, NY, USA} +} +""" + +_DESCRIPTION = """\ +We adopt the Universal Dependencies protocol for constructing the Singlish dependency treebank, both as a new resource +for the low-resource languages and to facilitate knowledge transfer from English. Briefly, the STB-EXT dataset offers +a 3-times larger training set, while keeping the same dev and test sets from STB-ACL. We provide treebanks with both +gold-standard as well as automatically generated POS tags. +""" + +_HOMEPAGE = "https://github.com/wanghm92/Sing_Par/tree/master/TALLIP19_dataset/treebank" + +_LICENSE = Licenses.MIT.value + +_PREFIX = "https://raw.githubusercontent.com/wanghm92/Sing_Par/master/TALLIP19_dataset/treebank/" +_URLS = { + "gold_pos": { + "train": _PREFIX + "gold_pos/train.ext.conll", + }, + "en_ud_autopos": {"train": _PREFIX + "en-ud-autopos/en-ud-train.conllu.autoupos", "validation": _PREFIX + "en-ud-autopos/en-ud-dev.conllu.ann.auto.epoch24.upos", "test": _PREFIX + "en-ud-autopos/en-ud-test.conllu.ann.auto.epoch24.upos"}, + "auto_pos_multiview": { + "train": _PREFIX + "auto_pos/multiview/train.autopos.multiview.conll", + "validation": _PREFIX + "auto_pos/multiview/dev.autopos.multiview.conll", + "test": _PREFIX + "auto_pos/multiview/test.autopos.multiview.conll", + }, + "auto_pos_stack": { + "train": _PREFIX + "auto_pos/stack/train.autopos.stack.conll", + "validation": _PREFIX + "auto_pos/stack/dev.autopos.stack.conll", + "test": _PREFIX + "auto_pos/stack/test.autopos.stack.conll", + }, +} +_POSTAGS = ["ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", "root"] +_SUPPORTED_TASKS = [Tasks.POS_TAGGING, Tasks.DEPENDENCY_PARSING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +def config_constructor(subset_id, schema, version): + return SEACrowdConfig(name=f"{_DATASETNAME}_{subset_id}_{schema}", + version=datasets.Version(version), description=_DESCRIPTION, + schema=schema, subset_id=subset_id) + + +class StbExtDataset(datasets.GeneratorBasedBuilder): + """This is a seacrowd dataloader for the STB-EXT dataset, which offers a 3-times larger training set, while keeping + the same dev and test sets from STB-ACL. 
It provides treebanks with both gold-standard and automatically generated POS tags.""" + + BUILDER_CONFIGS = [ + # source + config_constructor(subset_id="auto_pos_stack", schema="source", version=_SOURCE_VERSION), + config_constructor(subset_id="auto_pos_multiview", schema="source", version=_SOURCE_VERSION), + config_constructor(subset_id="en_ud_autopos", schema="source", version=_SOURCE_VERSION), + config_constructor(subset_id="gold_pos", schema="source", version=_SOURCE_VERSION), + # seq_label + config_constructor(subset_id="auto_pos_stack", schema="seacrowd_seq_label", version=_SEACROWD_VERSION), + config_constructor(subset_id="auto_pos_multiview", schema="seacrowd_seq_label", version=_SEACROWD_VERSION), + config_constructor(subset_id="en_ud_autopos", schema="seacrowd_seq_label", version=_SEACROWD_VERSION), + config_constructor(subset_id="gold_pos", schema="seacrowd_seq_label", version=_SEACROWD_VERSION), + # dependency parsing + config_constructor(subset_id="auto_pos_stack", schema="seacrowd_kb", version=_SEACROWD_VERSION), + config_constructor(subset_id="auto_pos_multiview", schema="seacrowd_kb", version=_SEACROWD_VERSION), + config_constructor(subset_id="en_ud_autopos", schema="seacrowd_kb", version=_SEACROWD_VERSION), + config_constructor(subset_id="gold_pos", schema="seacrowd_kb", version=_SEACROWD_VERSION), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_gold_pos_source" + + def _info(self): + if self.config.schema == "source": + features = datasets.Features( + { + # metadata + "sent_id": datasets.Value("string"), + "text": datasets.Value("string"), + "text_en": datasets.Value("string"), + # tokens + "id": [datasets.Value("string")], + "form": [datasets.Value("string")], + "lemma": [datasets.Value("string")], + "upos": [datasets.Value("string")], + "xpos": [datasets.Value("string")], + "feats": [datasets.Value("string")], + "head": [datasets.Value("string")], + "deprel": [datasets.Value("string")], + "deps": [datasets.Value("string")], + "misc": [datasets.Value("string")], + } + ) + elif self.config.schema == "seacrowd_seq_label": + features = schemas.seq_label_features(label_names=_POSTAGS) + elif self.config.schema == "seacrowd_kb": + features = schemas.kb_features + else: + raise ValueError(f"Invalid config: {self.config.schema}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """ "return splitGenerators""" + urls = _URLS[self.config.subset_id] + downloaded_files = dl_manager.download_and_extract(urls) + splits = [] + if "train" in downloaded_files: + splits.append(datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]})) + if "validation" in downloaded_files: + splits.append(datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["validation"]})) + if "test" in downloaded_files: + splits.append(datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": downloaded_files["test"]})) + return splits + + def _generate_examples(self, filepath): + def process_buffer(TextIO): + BOM = "\ufeff" + buffer = io.StringIO() + for line in TextIO: + line = line.replace(BOM, "") if BOM in line else line + buffer.write(line) + buffer.seek(0) + return buffer + + with open(filepath, "r", encoding="utf-8") as data_file: + tokenlist = list(conllu.parse_incr(process_buffer(data_file))) + data_instances = [] + for idx, sent in enumerate(tokenlist): + idx 
= sent.metadata["sent_id"] if "sent_id" in sent.metadata else idx + tokens = [token["form"] for token in sent] + txt = sent.metadata["text"] if "text" in sent.metadata else " ".join(tokens) + example = { + # meta + "sent_id": str(idx), + "text": txt, + "text_en": txt, + # tokens + "id": [token["id"] for token in sent], + "form": [token["form"] for token in sent], + "lemma": [token["lemma"] for token in sent], + "upos": [token["upos"] for token in sent], + "xpos": [token["xpos"] for token in sent], + "feats": [str(token["feats"]) for token in sent], + "head": [str(token["head"]) for token in sent], + "deprel": [str(token["deprel"]) for token in sent], + "deps": [str(token["deps"]) for token in sent], + "misc": [str(token["misc"]) for token in sent] + } + data_instances.append(example) + + if self.config.schema == "source": + pass + if self.config.schema == "seacrowd_seq_label": + data_instances = list( + map( + lambda d: { + "id": d["sent_id"], + "tokens": d["form"], + "labels": d["upos"], + }, + data_instances, + ) + ) + if self.config.schema == "seacrowd_kb": + data_instances = load_ud_data_as_seacrowd_kb(filepath, data_instances) + for key, exam in enumerate(data_instances): + yield key, exam diff --git a/seacrowd/sea_datasets/struct_amb_ind/__init__.py b/seacrowd/sea_datasets/struct_amb_ind/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/struct_amb_ind/struct_amb_ind.py b/seacrowd/sea_datasets/struct_amb_ind/struct_amb_ind.py new file mode 100644 index 000000000..5cd87002c --- /dev/null +++ b/seacrowd/sea_datasets/struct_amb_ind/struct_amb_ind.py @@ -0,0 +1,174 @@ +import os +from itertools import chain +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{widiaputri-etal-5641, + author = {Widiaputri, Ruhiyah Faradishi and Purwarianti, Ayu and Lestari, Dessi Puji and Azizah, Kurniawati and Tanaya, Dipta and Sakti, Sakriani}, + title = {Speech Recognition and Meaning Interpretation: Towards Disambiguation of Structurally Ambiguous Spoken Utterances in Indonesian}, + booktitle = {Proceedings of the EMNLP 2023}, + year = {2023} +} +""" + +_DATASETNAME = "struct_amb_ind" + +_DESCRIPTION = """ +This dataset contains the first Indonesian speech dataset for structurally ambiguous utterances and each of transcription and two disambiguation texts. +The structurally ambiguous sentences were adapted from Types 4,5,6, and 10 of Types Of Syntactic Ambiguity in English by [Taha et al., 1983]. +For each chosen type, 100 structurally ambiguous sentences in Indonesian were made by crowdsourcing. +Each Indonesian ambiguous sentence has two possible interpretations, resulting in two disambiguation text outputs for each ambiguous sentence. +Each disambiguation text is made up of two sentences. All of the sentences have been checked by linguists. 
+""" + +_HOMEPAGE = "https://github.com/ha3ci-lab/struct_amb_ind" + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = True # get the audio data externally from https://drive.google.com/drive/folders/1QeaptstBgwGYO6THGkZHHViExrogCMUj +_LANGUAGES = ["ind"] + +_URL_TEMPLATES = { + "keys": "https://raw.githubusercontent.com/ha3ci-lab/struct_amb_ind/main/keys/train_dev_test_spk_keys/", + "text": "https://raw.githubusercontent.com/ha3ci-lab/struct_amb_ind/main/text/", +} + +_URLS = { + "split_train": _URL_TEMPLATES["keys"] + "train_spk", + "split_dev": _URL_TEMPLATES["keys"] + "dev_spk", + "split_test": _URL_TEMPLATES["keys"] + "test_spk", + "text_transcript": _URL_TEMPLATES["text"] + "ID_amb_disam_transcript.txt", +} + +_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class StructAmbInd(datasets.GeneratorBasedBuilder): + """ + This dataset contains the first Indonesian speech dataset for structurally ambiguous utterances and each of transcription and two disambiguation texts. + This dataloader does NOT contain the additional training data for as mentioned in the _HOMEPAGE, as it is already implemented in the dataloader "indspeech_news_lvcsr". + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_sptext", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_sptext", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "speaker_id": datasets.Value("string"), + "path": datasets.Value("string"), + "audio": datasets.Audio(sampling_rate=16_000), + "amb_transcript": datasets.Value("string"), + "disam_text": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_sptext": + features = schemas.speech_text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + # The data_dir configuration is required ONLY for the audio_urls. + if self.config.data_dir is None: + raise ValueError("This is a local dataset. 
Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + # load the local audio folders + audio_urls = [data_dir + "/" + f"{gender}{_id:02}.zip" for gender in ["F", "M"] for _id in range(1, 12, 1)] + audio_files_dir = [Path(dl_manager.extract(audio_url)) / audio_url.split("/")[-1][:-4] for audio_url in audio_urls] + # load the speaker splits and transcript + split_train = Path(dl_manager.download(_URLS["split_train"])) + split_dev = Path(dl_manager.download(_URLS["split_dev"])) + split_test = Path(dl_manager.download(_URLS["split_test"])) + text_transcript = Path(dl_manager.download(_URLS["text_transcript"])) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"split": split_train, "transcript": text_transcript, "audio_files_dir": audio_files_dir}, + ), + datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"split": split_dev, "transcript": text_transcript, "audio_files_dir": audio_files_dir}), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"split": split_test, "transcript": text_transcript, "audio_files_dir": audio_files_dir}, + ), + ] + + def _generate_examples(self, split: Path, transcript: Path, audio_files_dir: List[Path]) -> Tuple[int, Dict]: + speaker_ids = open(split, "r").readlines() + speaker_ids = [id.replace("\n", "") for id in speaker_ids] + speech_folders = [audio_folder for audio_folder in audio_files_dir if audio_folder.name.split("/")[-1] in speaker_ids] + speech_files = list(chain(*[list(map((str(speech_folder) + "/").__add__, os.listdir(speech_folder))) for speech_folder in speech_folders])) + + transcript = open(transcript, "r").readlines() + transcript = [sent.replace("\n", "").split("|") for sent in transcript] + transcript_dict = {sent[0]: {"amb_transcript": sent[1], "disam_text": sent[2]} for sent in transcript} + + for key, aud_file in enumerate(speech_files): + aud_id = aud_file.split("/")[-1][:-4] + aud_info = aud_id.split("_") + if self.config.schema == "source": + row = { + "id": aud_id, + "speaker_id": aud_info[1], + "path": aud_file, + "audio": aud_file, + "amb_transcript": transcript_dict[aud_id]["amb_transcript"], + "disam_text": transcript_dict[aud_id]["disam_text"], + } + yield key, row + elif self.config.schema == "seacrowd_sptext": + row = { + "id": aud_id, + "path": aud_file, + "audio": aud_file, + "text": transcript_dict[aud_id]["amb_transcript"], + "speaker_id": aud_info[1], + "metadata": { + "speaker_age": None, + "speaker_gender": aud_info[1][0], + }, + } + yield key, row + else: + raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.") diff --git a/seacrowd/sea_datasets/tatabahasa/__init__.py b/seacrowd/sea_datasets/tatabahasa/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/tatabahasa/tatabahasa.py b/seacrowd/sea_datasets/tatabahasa/tatabahasa.py new file mode 100644 index 000000000..b622e9398 --- /dev/null +++ b/seacrowd/sea_datasets/tatabahasa/tatabahasa.py @@ -0,0 +1,156 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
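+# Usage sketch for the struct_amb_ind loader above (it is a local dataset,
+# _LOCAL = True): the speaker archives F01.zip-F11.zip and M01.zip-M11.zip must
+# be fetched manually from the Google Drive folder noted next to _LOCAL, and
+# their parent directory passed as data_dir. The path below is a placeholder.
+#
+#   import datasets
+#   ds = datasets.load_dataset(
+#       "seacrowd/sea_datasets/struct_amb_ind/struct_amb_ind.py",
+#       name="struct_amb_ind_source",
+#       data_dir="/path/to/struct_amb_ind_audio",  # holds F01.zip ... M11.zip
+#   )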
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This test is a general test for Malay grammar. Contains 349 questions that may be reinforced with instructions. +""" +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses, TASK_TO_SCHEMA + +_CITATION = None + +_DATASETNAME = "tatabahasa" + +_DESCRIPTION = """\ +This test is a general test for Malay grammar. Contains 349 questions. +""" + +_HOMEPAGE = "https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com" + +_LANGUAGES = ["zlm"] + +_LICENSE = Licenses.UNLICENSE.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://raw.githubusercontent.com/mesolitica/malaysian-dataset/master/llm-benchmark/tatabahasabm.tripod.com/quiz-tatabahasa.jsonl", +} + +_SUPPORTED_TASKS = [Tasks.COMMONSENSE_REASONING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class TatabahasaDataset(datasets.GeneratorBasedBuilder): + """This test is a general test for Malay grammar. Contains 349 questions.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA = TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower() + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + multi_choice = {"text" : datasets.Value("string"), "answer": datasets.Value("bool")} + features = datasets.Features({ + "question" : datasets.Value("string"), + "instruction": datasets.Value("string"), + "choices": { + "A": multi_choice, + "B": multi_choice, + "C": multi_choice, + "D": multi_choice, + }, + "website": datasets.Value("string") + }) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA}": + features = schemas.qa_features + features["meta"] = {"website": datasets.Value("string")} + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir, + }, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + with open(filepath ,'r') as f: + data = [json.loads(line) for line in f] + + if self.config.schema == "source": + for i in range(len(data)): + out = { + "question": data[i]["question"], + "instruction": 
data[i]["instruction"], + "choices": data[i]["choices"], + "website": data[i]["website"] + } + yield i, out + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA}": + for i in range(len(data)): + out = { + "id": i + 1, + "question_id": None, + "document_id": None, + "question": data[i]["question"], + "type": "multiple_choice", + "choices": [ + data[i]["choices"]["A"]["text"], + data[i]["choices"]["B"]["text"], + data[i]["choices"]["C"]["text"], + data[i]["choices"]["D"]["text"], + ], + "context": data[i]["instruction"], + "answer": [choice["text"] for choice in data[i]["choices"].values() if choice["answer"]], + "meta": {"website": data[i]["website"]}, + } + yield i, out diff --git a/seacrowd/sea_datasets/tcope/__init__.py b/seacrowd/sea_datasets/tcope/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/tcope/tcope.py b/seacrowd/sea_datasets/tcope/tcope.py new file mode 100644 index 000000000..8eb8810e1 --- /dev/null +++ b/seacrowd/sea_datasets/tcope/tcope.py @@ -0,0 +1,163 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@article{gonzales_broadening_2023, + author = {Gonzales, Wilkinson Daniel Wong}, + title = {Broadening horizons in the diachronic and sociolinguisstic study of + Philippine Englishes with the Twitter Corpus of Philippine Englishes (TCOPE)}, + journal = {English World-Wide}, + year = {2023}, + url = {https://osf.io/k3qzx}, + doi = {10.17605/OSF.IO/3Q5PW}, +} +""" + +_LOCAL = False +_LANGUAGES = ["eng", "fil"] +_DATASETNAME = "tcope" +_DESCRIPTION = """ +The TCOPE dataset consists of public tweets (amounting to about 13.5 million words) collected from 13 major cities from the Philippines. +Tweets are either purely in English or involve code-switching between English and Filipino. +Tweets are tagged for part-of-speech and dependency parsing using spaCy. Tweets collected are from 2010 to 2021. +The publicly available dataset is only a random sample (10%) from the whole TCOPE dataset, which consist of roughly 27 million tweets +(amounting to about 135 million words) collected from 29 major cities during the same date range. 
+""" + +_HOMEPAGE = "https://osf.io/3q5pw/wiki/home/" +_LICENSE = Licenses.CC0_1_0.value +_URL = "https://files.osf.io/v1/resources/3q5pw/providers/osfstorage/63737a5b0e715d3616a998f7" + +_SUPPORTED_TASKS = [Tasks.POS_TAGGING, Tasks.DEPENDENCY_PARSING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class TCOPEDataset(datasets.GeneratorBasedBuilder): + """TCOPE is a dataset of Philippine English tweets by Gonzales (2023).""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + # Actual data has invalid "labels" likely due to coding errors, + # such as "BODY", "BIRTHDAY", "HAVAIANAS", etc. Only valid + # POS tags are included here and in loaded data. + POS_LABELS = ["NOUN", "PUNCT", "PROPN", "VERB", "PRON", "ADP", "ADJ", "ADV", "DET", "AUX", "PART", "CCONJ", "INTJ", "SPACE", "SCONJ", "NUM", "X", "SYM"] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd sequence labeling schema", + schema="seacrowd_seq_label", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = "tcope_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "copeid": datasets.Value("string"), + "userid": datasets.Value("int64"), + "divided_tweet": datasets.Value("string"), + "postag": datasets.Value("string"), + "deptag": datasets.Value("string"), + "citycode": datasets.Value("string"), + "year": datasets.Value("int64"), + "extendedcope": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_seq_label": + features = schemas.seq_label_features(label_names=self.POS_LABELS) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # First ZIP contains second ZIP + # Second ZIP has spreadsheet data + folder_zip_dir = dl_manager.download_and_extract(_URL) + spreadsheet_zip_dir = dl_manager.extract(f"{folder_zip_dir}/public_v1/spreadsheet_format.zip") + spreadsheet_fp = f"{spreadsheet_zip_dir}/spreadsheet_format/tcope_v1_public_sample.csv" + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": spreadsheet_fp, + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + if self.config.schema not in ("source", "seacrowd_seq_label"): + raise ValueError(f"Received unexpected config schema {self.config.schema}") + + df = pd.read_csv(filepath, index_col=None) + df = df.rename(columns={"divided.tweet": "divided_tweet"}).query("divided_tweet.notna()") + + for index, row in df.iterrows(): + if self.config.schema == "source": + example = row.to_dict() + elif self.config.schema == "seacrowd_seq_label": + tokens, tags = self.split_token_and_tag(row["postag"], valid_tags=self.POS_LABELS) + example = { + "id": str(index), + "tokens": tokens, + "labels": tags, + } + yield index, example + + def split_token_and_tag(self, tweet: str, valid_tags: List[str]) -> Tuple[List[str], List[str]]: + """Split tweet into two separate lists 
of tokens and tags.""" + tokens_with_tags = tweet.split() + tokens = [] + tags = [] + for indiv_token_with_tag in tokens_with_tags: + token, tag = indiv_token_with_tag.rsplit("_", 1) + tokens.append(token) + if tag in valid_tags: + tags.append(tag) + else: # Use "X"/other spaCy tag for invalid POS tags + tags.append("X") + return tokens, tags diff --git a/seacrowd/sea_datasets/tgl_profanity/__init__.py b/seacrowd/sea_datasets/tgl_profanity/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/tgl_profanity/tgl_profanity.py b/seacrowd/sea_datasets/tgl_profanity/tgl_profanity.py new file mode 100644 index 000000000..e69135c05 --- /dev/null +++ b/seacrowd/sea_datasets/tgl_profanity/tgl_profanity.py @@ -0,0 +1,115 @@ +import csv +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +from datasets.download.download_manager import DownloadManager + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@article{galinato-etal-2023-context, + title="Context-Based Profanity Detection and Censorship Using Bidirectional Encoder Representations from Transformers", + author="Galinato, Valfrid and Amores, Lawrence and Magsino, Gino Ben and Sumawang, David Rafael", + month="jan", + year="2023" + url="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4341604" +} +""" + +_LOCAL = False +_LANGUAGES = ["tgl"] +_DATASETNAME = "tgl_profanity" +_DESCRIPTION = """\ +This dataset contains 13.8k Tagalog sentences containing profane words, together +with binary labels denoting whether or not the sentence conveys profanity / +abuse / hate speech. The data was scraped from Twitter using a Python library +called SNScrape and annotated manually by a panel of native Filipino speakers. 
+""" + +_HOMEPAGE = "https://huggingface.co/datasets/mginoben/tagalog-profanity-dataset/" +_LICENSE = Licenses.UNKNOWN.value +_SUPPORTED_TASKS = [Tasks.ABUSIVE_LANGUAGE_PREDICTION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" +_URLS = { + "train": "https://huggingface.co/datasets/mginoben/tagalog-profanity-dataset/resolve/main/train.csv", + "val": "https://huggingface.co/datasets/mginoben/tagalog-profanity-dataset/resolve/main/val.csv", +} + + +class TagalogProfanityDataset(datasets.GeneratorBasedBuilder): + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "text" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + CLASS_LABELS = ["1", "0"] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "text": datasets.Value("string"), + "label": datasets.Value("int64"), + } + ) + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text_features(label_names=self.CLASS_LABELS) + else: + raise ValueError(f"Invalid config name: {self.config.schema}") + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + data_files = dl_manager.download_and_extract(_URLS) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": data_files["train"]}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": data_files["val"]}, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + """Yield examples as (key, example) tuples""" + with open(filepath, encoding="utf-8") as f: + csv_reader = csv.reader(f, delimiter=",") + next(csv_reader, None) # skip the headers + for idx, row in enumerate(csv_reader): + text, label = row + if self.config.schema == "source": + example = {"text": text, "label": int(label)} + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + example = {"id": idx, "text": text, "label": int(label)} + yield idx, example diff --git a/seacrowd/sea_datasets/tha_lao_embassy_parcor/__init__.py b/seacrowd/sea_datasets/tha_lao_embassy_parcor/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/tha_lao_embassy_parcor/tha_lao_embassy_parcor.py b/seacrowd/sea_datasets/tha_lao_embassy_parcor/tha_lao_embassy_parcor.py new file mode 100644 index 000000000..4cd22cded --- /dev/null +++ b/seacrowd/sea_datasets/tha_lao_embassy_parcor/tha_lao_embassy_parcor.py @@ -0,0 +1,126 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +Wannaphong Phatthiyaphaibun. (2021). PyThaiNLP/Thai-Lao-Parallel-Corpus: \ +Thai Lao Parallel corpus v0.7 (v0.7). Zenodo \ +https://doi.org/10.5281/zenodo.5807093""" + +_DATASETNAME = "tha_lao_embassy_parcor" + +_DESCRIPTION = """\ +Thai-Lao Parallel Corpus contains equivalent Thai and Lao sentence pairs \ +derived from the website of the Royal Thai Embassy in Vientiane, Laos. +""" + +_HOMEPAGE = "https://github.com/PyThaiNLP/Thai-Lao-Parallel-Corpus/tree/master" +_LANGUAGES = ["tha", "lao"] +_LICENSE = Licenses.CC0_1_0.value + +_LOCAL = False +_URLS = {_DATASETNAME: "https://github.com/PyThaiNLP/Thai-Lao-Parallel-Corpus/raw/master/vientiane-thaiembassy-sent.csv"} + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] +_SOURCE_VERSION = "0.7.0" +_SEACROWD_VERSION = "1.0.0" + + +class ThaLaoEmbassyParcorDataset(datasets.GeneratorBasedBuilder): + """Thai-Lao Parallel Corpus contains equivalent Thai and Lao sentence pairs \ + derived from the website of the Royal Thai Embassy in Vientiane, Laos.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + SEACROWD_SCHEMA_NAME = "t2t" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "lao_sent": datasets.Value("string"), + "thai_sent": datasets.Value("string"), + } + ) + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS[_DATASETNAME] + filename = dl_manager.download(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(filename), + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + dataset = pd.read_csv(filepath) + + if self.config.schema == "source": + for i, row in dataset.iterrows(): + yield i, {"lao_sent": row["lao_sent"], "thai_sent": row["thai_sent"]} + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + for i, row in dataset.iterrows(): + yield i, { + "id": i, + "text_1": 
row["lao_sent"], + "text_2": row["thai_sent"], + "text_1_name": "lao", + "text_2_name": "tha", + } diff --git a/seacrowd/sea_datasets/thai_alpaca/__init__.py b/seacrowd/sea_datasets/thai_alpaca/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/thai_alpaca/thai_alpaca.py b/seacrowd/sea_datasets/thai_alpaca/thai_alpaca.py new file mode 100644 index 000000000..55f0c5b85 --- /dev/null +++ b/seacrowd/sea_datasets/thai_alpaca/thai_alpaca.py @@ -0,0 +1,108 @@ +from pathlib import Path +from typing import List + +import datasets +import pandas as pd +from datasets.download.download_manager import DownloadManager + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +# No paper citation found. +_CITATION = "" + +_LOCAL = False +_LANGUAGES = ["tha"] +_DATASETNAME = "thai_alpaca" +_DESCRIPTION = """\ +This is a Thai 🇹🇭-instructed dataset translated from cleaned version of the original +Alpaca Dataset released by Stanford using Google Cloud Translation, contain 52,000 +instructions and demonstrations generated by OpenAI's text-davinci-003 engine. This +instruction data can be used to conduct instruction-tuning for language models and +make the language model follow instruction better. +""" + +_HOMEPAGE = "https://huggingface.co/datasets/Thaweewat/alpaca-cleaned-52k-th" +_LICENSE = Licenses.CC_BY_NC_4_0.value +_URL = "https://huggingface.co/datasets/Thaweewat/alpaca-cleaned-52k-th/resolve/main/alpaca-cleaned-th.parquet" +_SUPPORTED_TASKS = [Tasks.INSTRUCTION_TUNING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class ThaiAlpacaDataset(datasets.GeneratorBasedBuilder): + """Thai Alpaca Dataset""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "t2t" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description="Thai-Alpaca source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description="Thai-Alpaca SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = "thai_alpaca_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "instruction": datasets.Value("string"), + "input": datasets.Value("string"), + "output": datasets.Value("string"), + } + ) + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + data_file = Path(dl_manager.download_and_extract(_URL)) + return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": data_file})] + + def _generate_examples(self, filepath: Path): + """Yield examples as (key, example) tuples""" + df = pd.read_parquet(filepath) + for idx, row in df.iterrows(): + if self.config.schema == "source": + example = {"instruction": row.get("instruction"), "input": row.get("input"), "output": row.get("output")} + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + inputs = 
row.get("input") + if inputs: + text_1 = f"Context: {inputs}\n\n{row.get('instruction')}" + else: + text_1 = f"Context: {row.get('instruction')}" + + example = { + "id": str(idx), + "text_1": text_1, + "text_2": row.get("output"), + "text_1_name": "input_instruction", + "text_2_name": "output", + } + + yield idx, example diff --git a/seacrowd/sea_datasets/thai_constitution/__init__.py b/seacrowd/sea_datasets/thai_constitution/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/thai_constitution/thai_constitution.py b/seacrowd/sea_datasets/thai_constitution/thai_constitution.py new file mode 100644 index 000000000..d0d20a971 --- /dev/null +++ b/seacrowd/sea_datasets/thai_constitution/thai_constitution.py @@ -0,0 +1,144 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import (DEFAULT_SEACROWD_VIEW_NAME, + DEFAULT_SOURCE_VIEW_NAME, Tasks, Licenses, TASK_TO_SCHEMA) + +_DATASETNAME = "thai_constitution" +_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME +_UNIFIED_VIEW_NAME = DEFAULT_SEACROWD_VIEW_NAME + +_CITATION = """\ +@misc{ + thaiconstitution, + title={Thai Constitution Corpus}, + url={https://github.com/PyThaiNLP/Thai-constitution-corpus}, + author={Wannaphong Phatthiyaphaibun} +} +""" + +_LOCAL = False + +_DESCRIPTION = """\ +Thailand's constitutional archive since 1932 +- Data collected from Office of the Council of State +- This project is part of the development plan PyThaiNLP +- The information collected in this text archive is in the public domain according to the Copyright Act 1994, Section 7 (The following are not considered copyrighted works under this Act: (1) daily news and Various facts which has the nature of being only news and not work in the literature department Science department or art department [...] 
(3) regulations, announcements, orders, clarifications and correspondence of ministries, bureaus, departments or any other government or local agencies [...]) +""" + +_HOMEPAGE = "https://github.com/PyThaiNLP/Thai-constitution-corpus/tree/master" + +_LICENSE = Licenses.CC0_1_0.value + +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] +_SEACROWD_SCHEMA_NAME = TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower() +_LANGUAGES = ['tha'] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + +_URLS = [ + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D%E0%B8%81%E0%B8%B2%E0%B8%A3%E0%B8%9B%E0%B8%81%E0%B8%84%E0%B8%A3%E0%B8%AD%E0%B8%87%E0%B8%A3%E0%B8%B2%E0%B8%8A%E0%B8%AD%E0%B8%B2%E0%B8%93%E0%B8%B2%E0%B8%88%E0%B8%B1%E0%B8%81%E0%B8%A3%20%E0%B8%9E%E0%B8%B8%E0%B8%97%E0%B8%98%E0%B8%A8%E0%B8%B1%E0%B8%81%E0%B8%A3%E0%B8%B2%E0%B8%8A%202502.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D%E0%B8%81%E0%B8%B2%E0%B8%A3%E0%B8%9B%E0%B8%81%E0%B8%84%E0%B8%A3%E0%B8%AD%E0%B8%87%E0%B8%A3%E0%B8%B2%E0%B8%8A%E0%B8%AD%E0%B8%B2%E0%B8%93%E0%B8%B2%E0%B8%88%E0%B8%B1%E0%B8%81%E0%B8%A3%20%E0%B8%9E%E0%B8%B8%E0%B8%97%E0%B8%98%E0%B8%A8%E0%B8%B1%E0%B8%81%E0%B8%A3%E0%B8%B2%E0%B8%8A%202515.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D%E0%B8%81%E0%B8%B2%E0%B8%A3%E0%B8%9B%E0%B8%81%E0%B8%84%E0%B8%A3%E0%B8%AD%E0%B8%87%E0%B8%A3%E0%B8%B2%E0%B8%8A%E0%B8%AD%E0%B8%B2%E0%B8%93%E0%B8%B2%E0%B8%88%E0%B8%B1%E0%B8%81%E0%B8%A3%20%E0%B8%9E%E0%B8%B8%E0%B8%97%E0%B8%98%E0%B8%A8%E0%B8%B1%E0%B8%81%E0%B8%A3%E0%B8%B2%E0%B8%8A%202520.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D%E0%B8%81%E0%B8%B2%E0%B8%A3%E0%B8%9B%E0%B8%81%E0%B8%84%E0%B8%A3%E0%B8%AD%E0%B8%87%E0%B8%A3%E0%B8%B2%E0%B8%8A%E0%B8%AD%E0%B8%B2%E0%B8%93%E0%B8%B2%E0%B8%88%E0%B8%B1%E0%B8%81%E0%B8%A3%20%E0%B8%9E%E0%B8%B8%E0%B8%97%E0%B8%98%E0%B8%A8%E0%B8%B1%E0%B8%81%E0%B8%A3%E0%B8%B2%E0%B8%8A%202534.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%9E%E0%B8%A3%E0%B8%B0%E0%B8%A3%E0%B8%B2%E0%B8%8A%E0%B8%9A%E0%B8%B1%E0%B8%8D%E0%B8%8D%E0%B8%B1%E0%B8%95%E0%B8%B4%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D%E0%B8%81%E0%B8%B2%E0%B8%A3%E0%B8%9B%E0%B8%81%E0%B8%84%E0%B8%A3%E0%B8%AD%E0%B8%87%E0%B9%81%E0%B8%9C%E0%B9%88%E0%B8%99%E0%B8%94%E0%B8%B4%E0%B8%99%E0%B8%AA%E0%B8%A2%E0%B8%B2%E0%B8%A1%E0%B8%8A%E0%B8%B1%E0%B9%88%E0%B8%A7%E0%B8%84%E0%B8%A3%E0%B8%B2%E0%B8%A72475.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D2475.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D2489.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D2492.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D2511.txt", + 
"https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D2517.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D2519.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D2521.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D2534.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D2540.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D2550.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D2560.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D%E0%B8%89%E0%B8%B0%E0%B8%9A%E0%B8%B1%E0%B8%9A%E0%B8%8A%E0%B8%B1%E0%B9%88%E0%B8%A7%E0%B8%84%E0%B8%A3%E0%B8%B2%E0%B8%A72490.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D%E0%B9%81%E0%B8%AB%E0%B9%88%E0%B8%87%E0%B8%A3%E0%B8%B2%E0%B8%8A%E0%B8%AD%E0%B8%B2%E0%B8%93%E0%B8%B2%E0%B8%88%E0%B8%B1%E0%B8%81%E0%B8%A3%E0%B9%84%E0%B8%97%E0%B8%A2%20(%E0%B8%89%E0%B8%9A%E0%B8%B1%E0%B8%9A%E0%B8%8A%E0%B8%B1%E0%B9%88%E0%B8%A7%E0%B8%84%E0%B8%A3%E0%B8%B2%E0%B8%A7)%20%E0%B8%9E%E0%B8%B8%E0%B8%97%E0%B8%98%E0%B8%A8%E0%B8%B1%E0%B8%81%E0%B8%A3%E0%B8%B2%E0%B8%8A%202549.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D%E0%B9%81%E0%B8%AB%E0%B9%88%E0%B8%87%E0%B8%A3%E0%B8%B2%E0%B8%8A%E0%B8%AD%E0%B8%B2%E0%B8%93%E0%B8%B2%E0%B8%88%E0%B8%B1%E0%B8%81%E0%B8%A3%E0%B9%84%E0%B8%97%E0%B8%A2%20(%E0%B8%89%E0%B8%9A%E0%B8%B1%E0%B8%9A%E0%B8%8A%E0%B8%B1%E0%B9%88%E0%B8%A7%E0%B8%84%E0%B8%A3%E0%B8%B2%E0%B8%A7)%20%E0%B8%9E%E0%B8%B8%E0%B8%97%E0%B8%98%E0%B8%A8%E0%B8%B1%E0%B8%81%E0%B8%A3%E0%B8%B2%E0%B8%8A%202557.txt", + "https://raw.githubusercontent.com/PyThaiNLP/Thai-constitution-corpus/master/data/%E0%B8%A3%E0%B8%B1%E0%B8%90%E0%B8%98%E0%B8%A3%E0%B8%A3%E0%B8%A1%E0%B8%99%E0%B8%B9%E0%B8%8D%E0%B9%81%E0%B8%AB%E0%B9%88%E0%B8%87%E0%B8%A3%E0%B8%B2%E0%B8%8A%E0%B8%AD%E0%B8%B2%E0%B8%93%E0%B8%B2%E0%B8%88%E0%B8%B1%E0%B8%81%E0%B8%A3%E0%B9%84%E0%B8%97%E0%B8%A2%20%E0%B8%9E%E0%B8%B8%E0%B8%97%E0%B8%98%E0%B8%A8%E0%B8%B1%E0%B8%81%E0%B8%A3%E0%B8%B2%E0%B8%8A%202475%20%E0%B9%81%E0%B8%81%E0%B9%89%E0%B9%84%E0%B8%82%E0%B9%80%E0%B8%9E%E0%B8%B4%E0%B9%88%E0%B8%A1%E0%B9%80%E0%B8%95%E0%B8%B4%E0%B8%A1%20%E0%B8%9E%E0%B8%B8%E0%B8%97%E0%B8%98%E0%B8%A8%E0%B8%B1%E0%B8%81%E0%B8%A3%E0%B8%B2%E0%B8%8A%202495.txt" + ] + + +class ThaiConstitutionDataset(datasets.GeneratorBasedBuilder): + """Thailand's constitutional archive since 1932""" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + 
name="thai_constitution_source", + version=datasets.Version(_SOURCE_VERSION), + description="Thai constitution source schema", + schema="source", + subset_id="thai_constitution", + ), + SEACrowdConfig( + name=f"thai_constitution_seacrowd_{_SEACROWD_SCHEMA_NAME}", + version=datasets.Version(_SEACROWD_VERSION), + description="Thai constitution SEACrowd schema", + schema=f"seacrowd_{_SEACROWD_SCHEMA_NAME}", + subset_id="thai_constitution", + ), + ] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ) + elif self.config.schema == f"seacrowd_{_SEACROWD_SCHEMA_NAME}": + features = schemas.self_supervised_pretraining.features + else: + raise ValueError(f"Invalid config schema: {self.config.schema}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + filepaths = [Path(dl_manager.download(url)) for url in _URLS] + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepaths": filepaths}, + ), + ] + + def _generate_examples(self, filepaths: List[Path]) -> Tuple[int, Dict]: + counter = 0 + for path in filepaths: + with open(path, encoding="utf-8") as f: + for line in f.readlines(): + if line.strip() == "": + continue + + if self.config.schema == "source": + yield ( + counter, + { + "id": str(counter), + "text": line.strip(), + }, + ) + elif self.config.schema == f"seacrowd_{_SEACROWD_SCHEMA_NAME}": + yield ( + counter, + { + "id": str(counter), + "text": line.strip(), + }, + ) + + counter += 1 diff --git a/seacrowd/sea_datasets/thai_databricks_dolly/__init__.py b/seacrowd/sea_datasets/thai_databricks_dolly/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/thai_databricks_dolly/thai_databricks_dolly.py b/seacrowd/sea_datasets/thai_databricks_dolly/thai_databricks_dolly.py new file mode 100644 index 000000000..a43e6573d --- /dev/null +++ b/seacrowd/sea_datasets/thai_databricks_dolly/thai_databricks_dolly.py @@ -0,0 +1,114 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd +from datasets.download.download_manager import DownloadManager + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +# No paper citation found. +_CITATION = "" + +_LOCAL = False +_LANGUAGES = ["tha"] +_DATASETNAME = "thai_databricks_dolly" +_DESCRIPTION = """\ +This is a Thai-instructed dataset translated from databricks-dolly-15k using +Google Cloud Translation. databricks-dolly-15k is an open-source dataset of +instruction-following records generated by thousands of Databricks employees in +several behavioral categories outlined in the InstructGPT paper, including +brainstorming, classification, closed QA, generation, information extraction, +open QA, and summarization. 
+""" + +_HOMEPAGE = "https://huggingface.co/datasets/Thaweewat/databricks-dolly-15k-th" +_LICENSE = Licenses.CC_BY_SA_3_0.value +_URL = "https://huggingface.co/datasets/Thaweewat/databricks-dolly-15k-th/resolve/main/databricks-dolly-15k-th.parquet" +_SUPPORTED_TASKS = [Tasks.INSTRUCTION_TUNING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class ThaiDatabricksDollyDataset(datasets.GeneratorBasedBuilder): + """Thai Databricks Dolly Dataset""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "t2t" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "instruction": datasets.Value("string"), + "context": datasets.Value("string"), + "response": datasets.Value("string"), + "category": datasets.Value("string"), + } + ) + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text2text_features + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + data_file = Path(dl_manager.download_and_extract(_URL)) + return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": data_file})] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + """Yield examples as (key, example) tuples""" + # pyarrow is an implicit dependency to load the parquet files + df = pd.read_parquet(filepath, engine="pyarrow") + for idx, row in df.iterrows(): + instruction = row.get("instruction").strip() + context = row.get("context").strip() + response = row.get("response").strip() + category = row.get("category").strip() + if self.config.schema == "source": + example = { + "instruction": instruction, + "context": context, + "response": response, + "category": category, + } + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + text_1 = f"Context: {context}\n\n{instruction}" if context else instruction + text_2 = response + example = { + "id": str(idx), + "text_1": text_1, + "text_2": text_2, + "text_1_name": "context_and_instruction", + "text_2_name": "response", + } + + yield idx, example diff --git a/seacrowd/sea_datasets/thai_depression/__init__.py b/seacrowd/sea_datasets/thai_depression/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/thai_depression/thai_depression.py b/seacrowd/sea_datasets/thai_depression/thai_depression.py new file mode 100644 index 000000000..e21a120c6 --- /dev/null +++ b/seacrowd/sea_datasets/thai_depression/thai_depression.py @@ -0,0 +1,145 @@ +import json +from pathlib import Path +from typing import List + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import DEFAULT_SEACROWD_VIEW_NAME, DEFAULT_SOURCE_VIEW_NAME, 
Licenses, Tasks + +_DATASETNAME = "thai_depression" +_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME +_UNIFIED_VIEW_NAME = DEFAULT_SEACROWD_VIEW_NAME + +_LANGUAGES = ["tha"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) +_LOCAL = False +_CITATION = """\ +@inproceedings{hamalainen-etal-2021-detecting, + title = "Detecting Depression in Thai Blog Posts: a Dataset and a Baseline", + author = {H{\"a}m{\"a}l{\"a}inen, Mika and + Patpong, Pattama and + Alnajjar, Khalid and + Partanen, Niko and + Rueter, Jack}, + editor = "Xu, Wei and + Ritter, Alan and + Baldwin, Tim and + Rahimi, Afshin", + booktitle = "Proceedings of the Seventh Workshop on Noisy User-generated Text (W-NUT 2021)", + month = nov, + year = "2021", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2021.wnut-1.3", + doi = "10.18653/v1/2021.wnut-1.3", + pages = "20--25", + abstract = "We present the first openly available corpus for detecting depression in Thai. Our corpus is compiled by expert verified cases of depression in several online blogs. + We experiment with two different LSTM based models and two different BERT based models. We achieve a 77.53%% accuracy with a Thai BERT model in detecting depression. + This establishes a good baseline for future researcher on the same corpus. Furthermore, we identify a need for Thai embeddings that have been trained on a more varied corpus than Wikipedia. + Our corpus, code and trained models have been released openly on Zenodo.", +} +""" + +_DESCRIPTION = """\ +We present the first openly available corpus for detecting depression in Thai. Our corpus is compiled by expert verified cases of depression in several online blogs. +We experiment with two different LSTM based models and two different BERT based models. We achieve a 77.53%% accuracy with a Thai BERT model in detecting depression. +This establishes a good baseline for future researcher on the same corpus. Furthermore, we identify a need for Thai embeddings that have been trained on a more varied corpus than Wikipedia. +Our corpus, code and trained models have been released openly on Zenodo. 
+""" + +_HOMEPAGE = "https://zenodo.org/records/4734552" + +_LICENSE = Licenses.CC_BY_NC_ND_4_0.value + +_URLs = "https://zenodo.org/records/4734552/files/data.zip?download=1" + +_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class ThaiDepressionDataset(datasets.GeneratorBasedBuilder): + """Thai depression detection dataset.""" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_text", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} seacrowd schema", + schema="seacrowd_text", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self): + if self.config.schema == "source": + features = datasets.Features( + { + "text": datasets.Value("string"), + "label": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_text": + features = schemas.text_features(["depression", "no_depression"]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + path = Path(dl_manager.download_and_extract(_URLs)) + data_files = { + "train": path / "splits/train.json", + "test": path / "splits/test.json", + "valid": path / "splits/valid.json", + } + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": data_files["train"]}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": data_files["valid"]}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": data_files["test"]}, + ), + ] + + def _parse_and_label(self, file_path): + with open(file_path, "r", encoding="utf-8") as file: + data = json.load(file) + + parsed_data = [] + for item in data: + parsed_data.append({"text": item[0], "label": item[1]}) + + return parsed_data + + def _generate_examples(self, filepath: Path): + print("Reading ", filepath) + for id, row in enumerate(self._parse_and_label(filepath)): + if self.config.schema == "source": + yield id, {"text": row["text"], "label": row["label"]} + elif self.config.schema == "seacrowd_text": + yield id, {"id": str(id), "text": row["text"], "label": row["label"]} + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/thai_gpteacher/__init__.py b/seacrowd/sea_datasets/thai_gpteacher/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/thai_gpteacher/thai_gpteacher.py b/seacrowd/sea_datasets/thai_gpteacher/thai_gpteacher.py new file mode 100644 index 000000000..142e42848 --- /dev/null +++ b/seacrowd/sea_datasets/thai_gpteacher/thai_gpteacher.py @@ -0,0 +1,118 @@ +from pathlib import Path + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + + +_CITATION = "" + +_DATASETNAME = "thai_gpteacher" + +_DESCRIPTION = """This is a Thai-instructed dataset translated using Google Cloud Translation from GPTeacher, a +collection of modular datasets generated by GPT-4, General-Instruct & Roleplay-Instruct and is comprised of around 
+20,000 examples with deduplication. The dataset was asked to include reasoning and thought steps in the example +responses where appropriate. +""" + +_HOMEPAGE = "https://huggingface.co/datasets/Thaweewat/gpteacher-20k-th" + +_LANGUAGES = ["tha"] + +_LICENSE = Licenses.CC_BY_SA_3_0.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: { + "train": { + "qa": "https://huggingface.co/datasets/Thaweewat/gpteacher-20k-th/resolve/main/gpteacher-gpt4-instruct-qa-18k-th.parquet", + "role_play": "https://huggingface.co/datasets/Thaweewat/gpteacher-20k-th/resolve/main/gpteacher-gpt4-instruct-roleplay-2k-th.parquet", + } + }, +} + +_SUPPORTED_TASKS = [Tasks.INSTRUCTION_TUNING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class ThaiGPTeacherDataset(datasets.GeneratorBasedBuilder): + """Thai-instructed dataset translated using Google Cloud Translation from GPTeacher.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_t2t", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_t2t", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "instruction": datasets.Value("string"), + "input": datasets.Value("string"), + "output": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> list[datasets.SplitGenerator]: + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": data_dir, "split": "train"}, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> tuple[int, dict]: + df1 = pd.read_parquet(filepath[split]["qa"]) + df2 = pd.read_parquet(filepath[split]["role_play"]) + df = pd.concat([df1, df2], ignore_index=True) + if self.config.schema == "source": + for i, row in df.iterrows(): + yield i, {"instruction": row["instruction"], "input": row["input"], "output": row["output"]} + + elif self.config.schema == "seacrowd_t2t": + for i, row in df.iterrows(): + yield i, { + "id": str(i), + "text_1": row["instruction"] + "\n" + row["input"], + "text_2": row["output"], + "text_1_name": "instruction + input", + "text_2_name": "output", + } diff --git a/seacrowd/sea_datasets/thai_hh_rlhf/__init__.py b/seacrowd/sea_datasets/thai_hh_rlhf/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/thai_hh_rlhf/thai_hh_rlhf.py b/seacrowd/sea_datasets/thai_hh_rlhf/thai_hh_rlhf.py new file mode 100644 index 000000000..a2f431965 --- /dev/null +++ b/seacrowd/sea_datasets/thai_hh_rlhf/thai_hh_rlhf.py @@ -0,0 +1,122 @@ +from pathlib import Path +from typing import List + +import datasets +import pandas as pd +from datasets.download.download_manager import DownloadManager + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig 
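+# Note on the seacrowd_text schema used below: every source row carries one
+# "chosen" and one "rejected" conversation, and _generate_examples flattens it
+# into two labeled examples keyed (idx * 2) + i, so row 0 yields keys 0 and 1.
+# The values in this sketch are placeholders, not real data.
+#
+#   row = {"chosen": "<helpful reply>", "rejected": "<harmful reply>"}
+#   for i, label in enumerate(["chosen", "rejected"]):
+#       print((0 * 2) + i, {"id": "0", "text": row[label], "label": label})
+#   # 0 {'id': '0', 'text': '<helpful reply>', 'label': 'chosen'}
+#   # 1 {'id': '0', 'text': '<harmful reply>', 'label': 'rejected'}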
+from seacrowd.utils.constants import Licenses, Tasks
+
+# No paper was found for the Thai translation itself; the original Anthropic HH-RLHF paper is cited instead.
+_CITATION = """\
+@article{bai2022training,
+    title={Training a helpful and harmless assistant with reinforcement learning from human feedback},
+    author={Bai, Yuntao and Jones, Andy and Ndousse, Kamal and Askell, Amanda and Chen, Anna and DasSarma, Nova and Drain, Dawn and Fort, Stanislav and Ganguli, Deep and Henighan, Tom and others},
+    journal={arXiv preprint arXiv:2204.05862},
+    year={2022}
+}
+
+"""
+
+_LOCAL = False
+_LANGUAGES = ["tha"]
+_DATASETNAME = "thai_hh_rlhf"
+_DESCRIPTION = """\
+This is a Thai-translated dataset based on Anthropic/hh-rlhf, translated with Google Cloud Translation. This repository
+provides access to the 161K-example train split of Anthropic/hh-rlhf (Thai-translated).
+
+The original repository provides access to two different kinds of data:
+
+1. Human preference data about helpfulness and harmlessness from Training a Helpful and Harmless Assistant with
+Reinforcement Learning from Human Feedback. These data are meant to train preference (or reward) models for
+subsequent RLHF training. These data are not meant for supervised training of dialogue agents. Training dialogue
+agents on these data is likely to lead to harmful models and this should be avoided.
+
+2. Human-generated and annotated red teaming dialogues from Red Teaming Language Models to Reduce Harms: Methods,
+Scaling Behaviors, and Lessons Learned. These data are meant to understand how crowdworkers red team models and
+what types of red team attacks are successful or not. The data are not meant for fine-tuning or preference modeling
+(use the data above for preference modeling). These data are entire transcripts of conversations that are derived
+from the harmlessness preference modeling data described above, where only the chosen response is incorporated into
+the overall transcript. Furthermore, the transcripts are annotated with human and automated measurements of how
+harmful the overall dialogues are.
+
+The translated data corresponds to the first kind of data.
+"""
+
+_HOMEPAGE = "https://huggingface.co/datasets/Thaweewat/hh-rlhf-th"
+_LICENSE = Licenses.MIT.value
+_URL = "https://huggingface.co/datasets/Thaweewat/hh-rlhf-th/resolve/main/hh-rlhf-train-161k-th.parquet"
+_SUPPORTED_TASKS = [Tasks.REINFORCEMENT_LEARNING_WITH_HUMAN_FEEDBACK]
+_SOURCE_VERSION = "1.0.0"
+_SEACROWD_VERSION = "1.0.0"
+
+
+class ThaiHhRlhfDataset(datasets.GeneratorBasedBuilder):
+    """Thai_HH_RLHF Dataset"""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    SEACROWD_SCHEMA_NAME = "text"
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description="Thai_HH_RLHF source schema",
+            schema="source",
+            subset_id=_DATASETNAME,
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}",
+            version=SEACROWD_VERSION,
+            description="Thai_HH_RLHF SEACrowd schema",
+            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
+            subset_id=_DATASETNAME,
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "chosen": datasets.Value("string"),
+                    "rejected": datasets.Value("string"),
+                }
+            )
+        elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
+            features = schemas.text.features(label_names=["rejected", "chosen"])
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        data_file = Path(dl_manager.download_and_extract(_URL))
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": data_file})]
+
+    def _generate_examples(self, filepath: Path):
+        """Yield examples as (key, example) tuples"""
+        df = pd.read_parquet(filepath)
+        for idx, row in df.iterrows():
+            if self.config.schema == "source":
+                example = {"chosen": row.get("chosen"), "rejected": row.get("rejected")}
+
+                yield idx, example
+
+            elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
+                for i, label in enumerate(["chosen", "rejected"]):
+                    text = row.get(label)
+
+                    example = {"id": str(idx), "text": text, "label": label}
+
+                    yield (idx * 2) + i, example
diff --git a/seacrowd/sea_datasets/thai_sum/__init__.py b/seacrowd/sea_datasets/thai_sum/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/thai_sum/thai_sum.py b/seacrowd/sea_datasets/thai_sum/thai_sum.py
new file mode 100644
index 000000000..43dd4ac14
--- /dev/null
+++ b/seacrowd/sea_datasets/thai_sum/thai_sum.py
@@ -0,0 +1,145 @@
+import csv
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Tasks, Licenses
+from seacrowd.utils import schemas
+
+_CITATION = """\
+@mastersthesis{chumpolsathien_2020,
+    title={Using Knowledge Distillation from Keyword Extraction to Improve the Informativeness of Neural Cross-lingual Summarization},
+    author={Chumpolsathien, Nakhun},
+    year={2020},
+    school={Beijing Institute of Technology}
+}
+"""
+
+_DATASETNAME = "thai_sum"
+
+_DESCRIPTION = """
+We present ThaiSum, a large-scale corpus for Thai text summarization obtained from several online news websites namely Thairath, ThaiPBS, Prachathai, and The Standard.
This dataset consists of over 350,000 article and summary pairs written by journalists. +""" + +_HOMEPAGE = "https://github.com/nakhunchumpolsathien/ThaiSum" + +_LICENSE = Licenses.MIT.value + +_LANGUAGES = ["tha"] + +_LOCAL = False + +_URLS = { + "train": "https://nakhun-chumpolsathien.oss-us-west-1.aliyuncs.com/thaisum/thaisum.csv", + "val": "https://nakhun-chumpolsathien.oss-us-west-1.aliyuncs.com/thaisum/validation_set.csv", + "test": "https://nakhun-chumpolsathien.oss-us-west-1.aliyuncs.com/thaisum/test_set.csv", +} + +_SUPPORTED_TASKS = [Tasks.SUMMARIZATION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class ThaiSumDataset(datasets.GeneratorBasedBuilder): + """ + Sequence-to-sequence (Seq2Seq) models have shown great achievement in text summarization. + However, Seq2Seq model often requires large-scale training data to achieve effective results. + Although many impressive advancements in text summarization field have been made, + most of summarization studies focus on resource-rich languages. + The progress of Thai text summarization is still far behind. + The dearth of large-scale dataset keeps Thai text summarization in its infancy. + As far as our knowledge goes, there is not a large-scale dataset for Thai text summarization available anywhere. + Thus, we present ThaiSum, a large-scale corpus for Thai text summarization + obtained from several online news websites namely Thairath, ThaiPBS, Prachathai, and The Standard. + This dataset consists of over 350,000 article and summary pairs written by journalists. + We evaluate the performance of various existing summarization models on ThaiSum dataset and analyse + the characteristic of the dataset to present its difficulties. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_t2t", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features({"title": datasets.Value("string"), "body": datasets.Value("string"), "summary": datasets.Value("string"), "type": datasets.Value("string"), "tags": datasets.Value("string"), "url": datasets.Value("string")}) + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + path_dict = dl_manager.download_and_extract(_URLS) + train_path, val_path, test_path = path_dict["train"], path_dict["val"], path_dict["test"] + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": train_path, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": test_path}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": val_path, + }, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + csv.field_size_limit(int(1000000)) + with 
open(filepath, encoding="utf-8") as f: + csv_reader = csv.reader(f) + next(csv_reader) # skip header + if self.config.schema == "source": + for id_, row in enumerate(csv_reader): + yield id_, { + "title": row[0], + "body": row[1], + "summary": row[2], + "type": row[3], + "tags": row[4], + "url": row[5], + } + elif self.config.schema == "seacrowd_t2t": + for id_, row in enumerate(csv_reader): + yield id_, { + "id": str(id_), + "text_1": row[1], + "text_2": row[2], + "text_1_name": "document", + "text_2_name": "summary", + } diff --git a/seacrowd/sea_datasets/thai_toxicity_tweet/__init__.py b/seacrowd/sea_datasets/thai_toxicity_tweet/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/thai_toxicity_tweet/thai_toxicity_tweet.py b/seacrowd/sea_datasets/thai_toxicity_tweet/thai_toxicity_tweet.py new file mode 100644 index 000000000..ab5bc59a5 --- /dev/null +++ b/seacrowd/sea_datasets/thai_toxicity_tweet/thai_toxicity_tweet.py @@ -0,0 +1,120 @@ +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, TASK_TO_SCHEMA, Tasks + +_CITATION = """\ +@inproceedings{sirihattasak2018annotation, + title = {Annotation and Classification of Toxicity for Thai Twitter}, + author = {Sirihattasak, Sugan and Komachi, Mamoru and Ishikawa, Hiroshi}, + year = {2018}, + booktitle = {Proceedings of LREC 2018 Workshop and the 2nd Workshop on Text Analytics for Cybersecurity and Online Safety (TA-COS’18)}, + address = {Miyazaki, Japan}, +} +""" + +_LOCAL = False +_LANGUAGES = ["tha"] +_DATASETNAME = "thai_toxicity_tweet" +_DESCRIPTION = """\ +The Thai Toxicity Tweet Corpus contains 3,300 tweets (506 tweets with texts missing) annotated by humans with guidelines +including a 44-word dictionary. The author acquired 2,027 toxic and 1,273 non-toxic tweets, which were +manually labeled by three annotators. Toxicity is defined by the author as any message conveying harmful, +damaging, or negative intent, in accordance with their defined criteria for toxicity. 
+""" + +_HOMEPAGE = "https://github.com/tmu-nlp/ThaiToxicityTweetCorpus" +_LICENSE = Licenses.CC_BY_NC_4_0.value +_URL = "https://huggingface.co/datasets/thai_toxicity_tweet" + +_SUPPORTED_TASKS = [Tasks.ABUSIVE_LANGUAGE_PREDICTION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class ThaiToxicityTweetsDataset(datasets.GeneratorBasedBuilder): + """Dataset of Thai tweets annotated for toxicity.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + SEACROWD_SCHEMA = TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower() + CLASS_LABELS = [0, 1] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "tweet_id": datasets.Value("string"), + "tweet_text": datasets.Value("string"), + "toxic_votes": datasets.Value("int32"), + "nontoxic_votes": datasets.Value("int32"), + "is_toxic": datasets.ClassLabel(names=self.CLASS_LABELS), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA}": + features = schemas.text_features(label_names=self.CLASS_LABELS) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + # dl_manager not used since dataloader uses HF `load_dataset` + """Returns SplitGenerators.""" + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"split": "train"}, + ), + ] + + def _load_hf_data_from_remote(self) -> datasets.DatasetDict: + """Load dataset from HuggingFace.""" + HF_REMOTE_REF = "/".join(_URL.split("/")[-1:]) + _hf_dataset_source = datasets.load_dataset(HF_REMOTE_REF, split="train") + return _hf_dataset_source + + def _generate_examples(self, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + data = self._load_hf_data_from_remote() + index = 0 + for row in data: + if row["tweet_text"] in ("TWEET_NOT_FOUND", ""): + continue + if self.config.schema == "source": + example = row + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA}": + example = { + "id": str(index), + "text": row["tweet_text"], + "label": row["is_toxic"], + } + yield index, example + index += 1 diff --git a/seacrowd/sea_datasets/thaigov/thaigov.py b/seacrowd/sea_datasets/thaigov/thaigov.py new file mode 100644 index 000000000..ff9888bf8 --- /dev/null +++ b/seacrowd/sea_datasets/thaigov/thaigov.py @@ -0,0 +1,196 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +The dataset consists of individual news articles, each corresponding to a unique URL at the +Thai government website (https://www.thaigov.go.th/). The dataset structure is as follows: a topic header is +followed by the content of the news article, which is then succeeded by a blank line and the source URL +""" +import glob +import os +import re +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import jsonlines + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{, + author = {PyThaiNLP}, + title = {thaigov-v2-corpus}, + journal = {}, + volume = {}, + year = {2023}, + url = {https://github.com/PyThaiNLP/thaigov-v2-corpus/tree/master}, + doi = {}, + biburl = {}, + bibsource = {} +} +""" + +_DATASETNAME = "thaigov" + +_DESCRIPTION = """\ +This dataset is a corpus from ThaiGov. +""" + +_HOMEPAGE = "https://github.com/PyThaiNLP/thaigov-v2-corpus/tree/master/data" + +_LANGUAGES = ["tha"] + +_LICENSE = Licenses.PDDL.value + +_LOCAL = False + + +_URLS = { + _DATASETNAME: "https://github.com/PyThaiNLP/thaigov-v2-corpus/archive/refs/heads/master.zip", +} + +_SUPPORTED_TASKS = [Tasks.SUMMARIZATION] + +_SOURCE_VERSION = "2.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class NewDataset(datasets.GeneratorBasedBuilder): + """This dataset is a corpus from ThaiGov, can be used for summarization tasks.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="thaigov_source", + version=SOURCE_VERSION, + description="thaigov source schema", + schema="source", + subset_id="thaigov", + ), + SEACrowdConfig( + name="thaigov_seacrowd_t2t", + version=SEACROWD_VERSION, + description="thaigov SEACrowd schema", + schema="seacrowd_t2t", + subset_id="thaigov", + ), + ] + + DEFAULT_CONFIG_NAME = "thaigov_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "src": datasets.Value("string"), + "tgt": datasets.Value("string"), + "url": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + # Since the data is stored based on date extracted, it will follow the pattern data/year/month/day/{article_names}.txt + list_all_txt_files = list(glob.glob(os.path.join(data_dir, "thaigov-v2-corpus-master", "data", "*", "*", "*", "*.txt"))) + all_data = [] + counter = 0 + for i in list_all_txt_files: + d = self._read_file(i) + all_data.append({"id": counter, "src": d["context"], "tgt": d["title"], "url": d["url"]}) + counter += 1 + + 
self._write_jsonl(data_dir + "/train.jsonl", all_data) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join(data_dir, "train.jsonl"), + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + if self.config.schema == "source": + i = 0 + with jsonlines.open(filepath) as f: + for each_data in f.iter(): + ex = { + "id": each_data["id"], + "src": each_data["src"], + "tgt": each_data["tgt"], + "url": each_data["url"], + } + yield i, ex + i += 1 + + elif self.config.schema == "seacrowd_t2t": + i = 0 + with jsonlines.open(filepath) as f: + for each_data in f.iter(): + ex = {"id": each_data["id"], "text_1": each_data["src"], "text_2": each_data["tgt"], "text_1_name": "input_document", "text_2_name": "output_summary"} + yield i, ex + i += 1 + + def _read_file(self, path): + text = {"title": "", "context": "", "url": ""} + page_view_line = 0 + with open(path, "r", encoding="utf-8-sig") as f: + for n, line in enumerate(f): + line = line.strip() + if n == 0: # title line + text["title"] = line.strip() + else: + if line: + if re.match(r"^[\d,]+$", line): + page_view_line = n + continue + if line == "พิมพ์" or page_view_line and page_view_line < n: # skip 'print' + continue + if re.match(r"^ที่มา : http", line): + text["url"] = line.strip().split(" ")[-1] + else: + text["context"] += line.strip().replace("\xa0", "") + "\n" + return text + + def _write_jsonl(self, filepath, values): + with jsonlines.open(filepath, "w") as writer: + for line in values: + writer.write(line) diff --git a/seacrowd/sea_datasets/tico_19/tico_19.py b/seacrowd/sea_datasets/tico_19/tico_19.py index 0630532c7..438cba5d6 100644 --- a/seacrowd/sea_datasets/tico_19/tico_19.py +++ b/seacrowd/sea_datasets/tico_19/tico_19.py @@ -14,14 +14,13 @@ # limitations under the License. 
import csv -from fnmatch import translate import os import re from pathlib import Path from typing import Dict, List, Tuple -from translate.storage.tmx import tmxfile import datasets +from translate.storage.tmx import tmxfile from seacrowd.utils import schemas from seacrowd.utils.configs import SEACrowdConfig @@ -59,34 +58,50 @@ """ # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) -_LANGUAGES = ["ind", "ara", "spa", "fra", "hin", "por", "rus", "zho", "eng"] +_LANGUAGES = ["ind", "ara", "spa", "fra", "hin", "por", "rus", "zho", "eng", "khm", "zlm", "mya", "tgl", "tam"] _LOCAL = False _SUPPORTED_LANG_PAIRS = [ - ("ind", "ara"), ("ind", "spa"), ("ind", "fra"), ("ind", "hin"), ("ind", "por"), ("ind", "rus"), ("ind", "zho"), ("ind", "eng"), - ("ara", "ind"), ("spa", "ind"), ("fra", "ind"), ("hin", "ind"), ("por", "ind"), ("rus", "ind"), ("zho", "ind"), ("eng", "ind") + ("ind", "ara"), + ("ind", "spa"), + ("ind", "fra"), + ("ind", "hin"), + ("ind", "por"), + ("ind", "rus"), + ("ind", "zho"), + ("ind", "eng"), + ("ara", "ind"), + ("spa", "ind"), + ("fra", "ind"), + ("hin", "ind"), + ("por", "ind"), + ("rus", "ind"), + ("zho", "ind"), + ("eng", "ind"), + ("khm", "eng"), + ("eng", "khm"), + ("mya", "eng"), + ("eng", "mya"), + ("zlm", "eng"), + ("eng", "zlm"), + ("tgl", "eng"), + ("eng", "tgl"), + ("tam", "eng"), + ("eng", "tam"), ] -_LANG_CODE_MAP = { - "ind": "id", - "ara": "ar", - "spa": "es-LA", - "fra": "fr", - "hin": "hi", - "por": "pt-BR", - "rus": "ru", - "zho": "zh", - "eng": "en" -} +_LANG_CODE_MAP = {"ind": "id", "ara": "ar", "spa": "es-LA", "fra": "fr", "hin": "hi", "por": "pt-BR", "rus": "ru", "zho": "zh", "eng": "en", "khm": "km", "zlm": "ms", "mya": "my", "tgl": "tl", "tam": "ta"} + +_DEVTEST_LANG_PAIRS = [_LANG_CODE_MAP[source_lang] + "-" + _LANG_CODE_MAP[target_lang] for (source_lang, target_lang) in _SUPPORTED_LANG_PAIRS if (source_lang == "eng" or target_lang == "eng")] _DATASETNAME = "tico_19" _DESCRIPTION = """\ -TICO-19 (Translation Initiative for COVID-19) is sampled from a variety of public sources containing -COVID-19 related content, representing different domains (e.g., news, wiki articles, and others). TICO-19 -includes 30 documents (3071 sentences, 69.7k words) translated from English into 36 languages: Amharic, -Arabic (Modern Standard), Bengali, Chinese (Simplified), Dari, Dinka, Farsi, French (European), Hausa, -Hindi, Indonesian, Kanuri, Khmer (Central), Kinyarwanda, Kurdish Kurmanji, Kurdish Sorani, Lingala, -Luganda, Malay, Marathi, Myanmar, Nepali, Nigerian Fulfulde, Nuer, Oromo, Pashto, Portuguese (Brazilian), +TICO-19 (Translation Initiative for COVID-19) is sampled from a variety of public sources containing +COVID-19 related content, representing different domains (e.g., news, wiki articles, and others). TICO-19 +includes 30 documents (3071 sentences, 69.7k words) translated from English into 36 languages: Amharic, +Arabic (Modern Standard), Bengali, Chinese (Simplified), Dari, Dinka, Farsi, French (European), Hausa, +Hindi, Indonesian, Kanuri, Khmer (Central), Kinyarwanda, Kurdish Kurmanji, Kurdish Sorani, Lingala, +Luganda, Malay, Marathi, Myanmar, Nepali, Nigerian Fulfulde, Nuer, Oromo, Pashto, Portuguese (Brazilian), Russian, Somali, Spanish (Latin American), Swahili, Congolese Swahili, Tagalog, Tamil, Tigrinya, Urdu, Zulu. 
""" @@ -94,10 +109,7 @@ _LICENSE = "CC0" -_URLS = { - "evaluation": "https://tico-19.github.io/data/tico19-testset.zip", - "all": "https://tico-19.github.io/data/TM/all.{lang_pairs}.tmx.zip" -} +_URLS = {"evaluation": "https://tico-19.github.io/data/tico19-testset.zip", "all": "https://tico-19.github.io/data/TM/all.{lang_pairs}.tmx.zip"} _SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] @@ -128,16 +140,14 @@ def seacrowd_config_constructor(lang_source, lang_target, schema, version): subset_id="tico_19", ) + class Tico19(datasets.GeneratorBasedBuilder): """TICO-19 is MT dataset sampled from a variety of public sources containing COVID-19 related content""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - BUILDER_CONFIGS = [ - seacrowd_config_constructor(src, tgt, schema, version) - for src, tgt in [("", "")] + _SUPPORTED_LANG_PAIRS for schema, version in zip(["source", "seacrowd_t2t"], [_SOURCE_VERSION, _SEACROWD_VERSION]) - ] + BUILDER_CONFIGS = [seacrowd_config_constructor(src, tgt, schema, version) for src, tgt in [("", "")] + _SUPPORTED_LANG_PAIRS for schema, version in zip(["source", "seacrowd_t2t"], [_SOURCE_VERSION, _SEACROWD_VERSION])] DEFAULT_CONFIG_NAME = "tico_19_source" @@ -168,7 +178,7 @@ def _info(self) -> datasets.DatasetInfo: def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - + try: lang_pairs_config = re.search("tico_19_(.+?)_(source|seacrowd_t2t)", self.config.name).group(1) lang_src, lang_tgt = lang_pairs_config.split("_") @@ -177,25 +187,19 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase lang_pairs = _LANG_CODE_MAP[lang_src] + "-" + _LANG_CODE_MAP[lang_tgt] - # dev & test split only applicable to eng-ind language pair - if lang_pairs in ["en-id", "id-en"]: + # dev & test split only applicable to eng-[sea language] language pair + if lang_pairs in set(_DEVTEST_LANG_PAIRS): + lang_sea = _LANG_CODE_MAP[lang_tgt] if lang_src == "eng" else _LANG_CODE_MAP[lang_src] + data_dir = dl_manager.download_and_extract(_URLS["evaluation"]) return [ datasets.SplitGenerator( name=datasets.Split.TEST, - gen_kwargs={ - "filepath": os.path.join(data_dir, "tico19-testset", "test", f"test.en-id.tsv"), - "lang_source": lang_src, - "lang_target": lang_tgt - }, + gen_kwargs={"filepath": os.path.join(data_dir, "tico19-testset", "test", f"test.en-{lang_sea}.tsv"), "lang_source": lang_src, "lang_target": lang_tgt}, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": os.path.join(data_dir, "tico19-testset", "dev", f"dev.en-id.tsv"), - "lang_source": lang_src, - "lang_target": lang_tgt - }, + gen_kwargs={"filepath": os.path.join(data_dir, "tico19-testset", "dev", f"dev.en-{lang_sea}.tsv"), "lang_source": lang_src, "lang_target": lang_tgt}, ), ] else: @@ -203,20 +207,16 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": os.path.join(data_dir, f"all.{lang_pairs}.tmx"), - "lang_source": lang_src, - "lang_target": lang_tgt - }, + gen_kwargs={"filepath": os.path.join(data_dir, f"all.{lang_pairs}.tmx"), "lang_source": lang_src, "lang_target": lang_tgt}, ) ] def _generate_examples(self, filepath: Path, lang_source: str, lang_target: str) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" - + if self.config.schema == "source": - # eng-ind language pair dataset 
provided in .tsv format - if (lang_source == "eng" and lang_target == "ind") or (lang_source == "ind" and lang_target == "eng"): + # eng-[sea language] language pair dataset provided in .tsv format + if f"{_LANG_CODE_MAP[lang_source]}-{_LANG_CODE_MAP[lang_target]}" in set(_DEVTEST_LANG_PAIRS): with open(filepath, encoding="utf-8") as f: reader = csv.reader(f, delimiter="\t", quotechar='"') for id_, row in enumerate(reader): @@ -242,7 +242,7 @@ def _generate_examples(self, filepath: Path, lang_source: str, lang_target: str) "license": row[6], "translatorId": row[7], } - + # all language pairs except eng-ind dataset provided in .tmx format else: with open(filepath, "rb") as f: @@ -250,8 +250,8 @@ def _generate_examples(self, filepath: Path, lang_source: str, lang_target: str) for id_, node in enumerate(tmx_file.unit_iter()): try: - url = [text for text in node.xmlelement.itertext('prop')][0] - except: + url = [text for text in node.xmlelement.itertext("prop")][0] + except Exception: url = "" yield id_, { "sourceLang": _LANG_CODE_MAP[lang_source], @@ -265,7 +265,7 @@ def _generate_examples(self, filepath: Path, lang_source: str, lang_target: str) } elif self.config.schema == "seacrowd_t2t": - if (lang_source == "eng" and lang_target == "ind") or (lang_source == "ind" and lang_target == "eng"): + if f"{_LANG_CODE_MAP[lang_source]}-{_LANG_CODE_MAP[lang_target]}" in set(_DEVTEST_LANG_PAIRS): with open(filepath, encoding="utf-8") as f: reader = csv.reader(f, delimiter="\t", quotechar='"') for id_, row in enumerate(reader): @@ -277,22 +277,10 @@ def _generate_examples(self, filepath: Path, lang_source: str, lang_target: str) else: source_string = row[3] target_string = row[2] - yield id_, { - "id": row[4], - "text_1": source_string, - "text_2": target_string, - "text_1_name": lang_source, - "text_2_name": lang_target - } + yield id_, {"id": row[4], "text_1": source_string, "text_2": target_string, "text_1_name": lang_source, "text_2_name": lang_target} else: with open(filepath, "rb") as f: tmx_file = tmxfile(f) - + for id_, node in enumerate(tmx_file.unit_iter()): - yield id_, { - "id": node.getid(), - "text_1": node.source, - "text_2": node.target, - "text_1_name": lang_source, - "text_2_name": lang_target - } + yield id_, {"id": node.getid(), "text_1": node.source, "text_2": node.target, "text_1_name": lang_source, "text_2_name": lang_target} diff --git a/seacrowd/sea_datasets/tmad_malay_corpus/tmad_malay_corpus.py b/seacrowd/sea_datasets/tmad_malay_corpus/tmad_malay_corpus.py new file mode 100644 index 000000000..88afbf428 --- /dev/null +++ b/seacrowd/sea_datasets/tmad_malay_corpus/tmad_malay_corpus.py @@ -0,0 +1,140 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +The Towards Malay Abbreviation Disambiguation (TMAD) Malay Corpus includes sentences from Malay news sites with abbreviations and their meanings. Only abbreviations with more than one possible meaning are included. 
+"""
+import csv
+import json
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """\
+@article{article,
+author = {Ciosici, Manuel and Sommer, Tobias},
+year = {2019},
+month = {04},
+pages = {},
+title = {Unsupervised Abbreviation Disambiguation Contextual disambiguation using word embeddings}
+}
+"""
+
+_DATASETNAME = "tmad_malay_corpus"
+
+_DESCRIPTION = """\
+The Towards Malay Abbreviation Disambiguation (TMAD) Malay Corpus includes sentences from Malay news sites with abbreviations and their meanings. Only abbreviations with more than one possible meaning are included.
+"""
+
+_HOMEPAGE = "https://github.com/bhysss/TMAD-CUM/tree/master"
+
+_LANGUAGES = ["zlm"]
+
+_LICENSE = Licenses.UNKNOWN.value
+
+_LOCAL = False
+
+_URLS = {
+    "train": "https://raw.githubusercontent.com/bhysss/TMAD-CUM/master/data/Malay/data_train.csv",
+    "dev": "https://raw.githubusercontent.com/bhysss/TMAD-CUM/master/data/Malay/data_dev.csv",
+    "test": "https://raw.githubusercontent.com/bhysss/TMAD-CUM/master/data/Malay/data_test.csv",
+    "dict": "https://raw.githubusercontent.com/bhysss/TMAD-CUM/master/data/Malay/May_dic.json",
+}
+_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING]
+
+_SOURCE_VERSION = "1.0.0"
+_SEACROWD_VERSION = "1.0.0"
+
+
+class TMADMalayCorpusDataset(datasets.GeneratorBasedBuilder):
+    """Abbreviation disambiguation dataset from Malay news sites."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_seacrowd_qa",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema="seacrowd_qa",
+            subset_id=f"{_DATASETNAME}",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features({"abbr": datasets.Value("string"), "definition": datasets.Value("string"), "sentence": datasets.Value("string"), "choices": datasets.Sequence(datasets.Value("string"))})
+
+        elif self.config.schema == "seacrowd_qa":
+            features = schemas.qa_features
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+
+        data_dirs = dl_manager.download_and_extract(_URLS)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                # Whatever you put in gen_kwargs will be passed to _generate_examples
+                gen_kwargs={"filepath": data_dirs["train"], "dictpath": data_dirs["dict"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"filepath": data_dirs["test"], "dictpath": data_dirs["dict"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={"filepath": data_dirs["dev"], "dictpath": data_dirs["dict"]},
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path, dictpath: Path) -> Tuple[int, Dict]:
+
+        with open(dictpath) as f:
+            may_dict = json.load(f)
+
+        if self.config.schema == "source":
+            with open(filepath, encoding="utf-8") as f:
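+                # Each CSV row carries the abbreviation ("Abbr"), its gold definition ("Definition") and the
+                # sentence it appears in ("Sentence"); the candidate meanings are looked up from May_dic.json.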
+ for row_idx, row in enumerate(csv.DictReader(f)): + yield row_idx, {"abbr": row["Abbr"], "definition": row["Definition"], "sentence": row["Sentence"], "choices": may_dict[row["Abbr"]]} + + elif self.config.schema == "seacrowd_qa": + with open(filepath, encoding="utf-8") as f: + for row_idx, row in enumerate(csv.DictReader(f)): + yield row_idx, {"id": row_idx, "question_id": 0, "document_id": 0, "question": row["Abbr"], "type": "multiple_choice", "choices": may_dict[row["Abbr"]], "context": row["Sentence"], "answer": [row["Definition"]], "meta": {}} diff --git a/seacrowd/sea_datasets/tydiqa/__init__.py b/seacrowd/sea_datasets/tydiqa/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/tydiqa/tydiqa.py b/seacrowd/sea_datasets/tydiqa/tydiqa.py new file mode 100644 index 000000000..2379144e6 --- /dev/null +++ b/seacrowd/sea_datasets/tydiqa/tydiqa.py @@ -0,0 +1,436 @@ +import json + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = r"""\ +@article{clark-etal-2020-tydi, + title = "{T}y{D}i {QA}: A Benchmark for Information-Seeking Question Answering in Typologically Diverse Languages", + author = "Clark, Jonathan H. and + Choi, Eunsol and + Collins, Michael and + Garrette, Dan and + Kwiatkowski, Tom and + Nikolaev, Vitaly and + Palomaki, Jennimaria", + editor = "Johnson, Mark and + Roark, Brian and + Nenkova, Ani", + journal = "Transactions of the Association for Computational Linguistics", + volume = "8", + year = "2020", + address = "Cambridge, MA", + publisher = "MIT Press", + url = "https://aclanthology.org/2020.tacl-1.30", + doi = "10.1162/tacl_a_00317", + pages = "454--470", + abstract = "Confidently making progress on multilingual modeling requires challenging, trustworthy evaluations. + We present TyDi QA{---}a question answering dataset covering 11 typologically diverse languages with 204K + question-answer pairs. The languages of TyDi QA are diverse with regard to their typology{---}the set of + linguistic features each language expresses{---}such that we expect models performing well on this set to + generalize across a large number of the world{'}s languages. We present a quantitative analysis of the data + quality and example-level qualitative linguistic analyses of observed language phenomena that would not be found + in English-only corpora. 
To provide a realistic information-seeking task and avoid priming effects, questions are + written by people who want to know the answer, but don{'}t know the answer yet, and the data is collected directly + in each language without the use of translation.", +} + +@inproceedings{cahyawijaya-etal-2021-indonlg, + title = "{I}ndo{NLG}: Benchmark and Resources for Evaluating {I}ndonesian Natural Language Generation", + author = "Cahyawijaya, Samuel and + Winata, Genta Indra and + Wilie, Bryan and + Vincentio, Karissa and + Li, Xiaohong and + Kuncoro, Adhiguna and + Ruder, Sebastian and + Lim, Zhi Yuan and + Bahar, Syafri and + Khodra, Masayu and + Purwarianti, Ayu and + Fung, Pascale", + booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", + month = nov, + year = "2021", + address = "Online and Punta Cana, Dominican Republic", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2021.emnlp-main.699", + doi = "10.18653/v1/2021.emnlp-main.699", + pages = "8875--8898" +} +""" + +_DATASETNAME = "tydiqa" + +_DESCRIPTION = """\ + TyDi QA is a question answering dataset covering 11 typologically diverse languages with 204K question-answer pairs. + The languages of TyDi QA are diverse with regard to their typology -- the set of linguistic features that each language + expresses -- such that we expect models performing well on this set to generalize across a large number of the languages + in the world. It contains language phenomena that would not be found in English-only corpora. To provide a realistic + information-seeking task and avoid priming effects, questions are written by people who want to know the answer, but + don’t know the answer yet, (unlike SQuAD and its descendents) and the data is collected directly in each language + without the use of translation (unlike MLQA and XQuAD). + """ + +_HOMEPAGE = "https://github.com/google-research-datasets/tydiqa" +_LICENSE = Licenses.APACHE_2_0.value +_HF_URL = "https://huggingface.co/datasets/tydiqa" +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] +_LANGUAGES = ["ind", "tha"] +_LOCAL = False +_SOURCE_VERSION_P = "1.0.0" +_SOURCE_VERSION_S = "1.1.0" +_SEACROWD_VERSION = "1.0.0" + +_URL = "https://storage.googleapis.com/tydiqa/" +_PRIMARY_URLS = { + "train": _URL + "v1.0/tydiqa-v1.0-train.jsonl.gz", + "dev": _URL + "v1.0/tydiqa-v1.0-dev.jsonl.gz", +} +_SECONDARY_URLS = { + "train": _URL + "v1.1/tydiqa-goldp-v1.1-train.json", + "dev": _URL + "v1.1/tydiqa-goldp-v1.1-dev.json", +} + +_SELECTP_DESP = """Passage selection task (SelectP): Given a list of the passages in the article, return either (a) the index of + the passage that answers the question or (b) NULL if no such passage exists. + """ +_MINSPAN_DESP = """Minimal answer span task (MinSpan): Given the full text of an article, return one of (a) the start and end + byte indices of the minimal span that completely answers the question; (b) YES or NO if the question requires + a yes/no answer and we can draw a conclusion from the passage; (c) NULL if it is not possible to produce a + minimal answer for this question.""" +_GOLDP_DESP = """Gold passage task (GoldP): Given a passage that is guaranteed to contain the + answer, predict the single contiguous span of characters that answers the question. This is more similar to + existing reading comprehension datasets (as opposed to the information-seeking task outlined above). 
+ """ +_ID_DESP = """{I}ndo{NLG}: Benchmark and Resources for Evaluating {I}ndonesian Natural Language Generation, is a benchmark + for evaluating Indonesian natural language generation (NLG) systems. The question-answer pairs are collected + for each language without using translation services. It uses the Indonesian data from the secondary Gold + passage task of the TyDiQA dataset. As the original dataset only provides training and validation sets, + TydiQA-ID randomly split off 15% of the training data and use it as the test set. + """ + + +def config_constructor(subset_id, schema, desc, version): + return SEACrowdConfig(name=f"{_DATASETNAME}_{subset_id}_{schema}", description=desc, version=datasets.Version(version), schema=schema, subset_id=subset_id) + + +class TydiqaDataset(datasets.GeneratorBasedBuilder): + """ + This is a main class of SEACrowd dataloader for TyDi QA, which is a question answering dataset covering 11 typologically + diverse languages with 204K question-answer pairs. The languages of TyDi QA are diverse with regard to their typology. + Here we also specially provide the split on the primary and secondary task for SEA language like indonesian and thai. + """ + + BUILDER_CONFIGS = [ + # source schema + # selectp source schema + config_constructor(subset_id="selectp", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P), + config_constructor(subset_id="selectp_ind", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P), + config_constructor(subset_id="selectp_tha", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P), + # minspan source schema + config_constructor(subset_id="minspan", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P), + config_constructor(subset_id="minspan_ind", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P), + config_constructor(subset_id="minspan_tha", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P), + # goldp source schema + config_constructor(subset_id="goldp", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S), + config_constructor(subset_id="goldp_ind", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S), + # tydiqa_id source schema + config_constructor(subset_id="id", schema="source", desc=_ID_DESP, version=_SOURCE_VERSION_P), + # seacrowd schema + # selectp seacrowd schema + config_constructor(subset_id="selectp", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="selectp_ind", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="selectp_tha", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION), + # minspan seacrowd schema + config_constructor(subset_id="minspan", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="minspan_ind", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="minspan_tha", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION), + # goldp seacrowd schema + config_constructor(subset_id="goldp", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="goldp_ind", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION), + # tydiqa_id seacrowd schema + config_constructor(subset_id="id", schema="seacrowd_qa", desc=_ID_DESP, version=_SEACROWD_VERSION), + ] + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_id_source" + + def _info(self): + if ("selectp" in 
self.config.name) or ("minspan" in self.config.name): + if "source" in self.config.name: + features = datasets.Features( + { + "passage_answer_candidates": datasets.features.Sequence( + { + "plaintext_start_byte": datasets.Value("int32"), + "plaintext_end_byte": datasets.Value("int32"), + } + ), + "question_text": datasets.Value("string"), + "document_title": datasets.Value("string"), + "language": datasets.Value("string"), + "annotations": datasets.features.Sequence( + { + "passage_answer_candidate_index": datasets.Value("int32"), + "minimal_answers_start_byte": datasets.Value("int32"), + "minimal_answers_end_byte": datasets.Value("int32"), + "yes_no_answer": datasets.Value("string"), + } + ), + "document_plaintext": datasets.Value("string"), + "document_url": datasets.Value("string"), + } + ) + elif "seacrowd" in self.config.name: + features = schemas.qa_features + features["meta"] = { + "passage_answer_candidates": datasets.features.Sequence( + { + "plaintext_start_byte": datasets.Value("int32"), + "plaintext_end_byte": datasets.Value("int32"), + } + ), + "annotations": datasets.features.Sequence( + { + "passage_answer_candidate_index": datasets.Value("int32"), + "minimal_answers_start_byte": datasets.Value("int32"), + "minimal_answers_end_byte": datasets.Value("int32"), + "yes_no_answer": datasets.Value("string"), + } + ), + "language": datasets.Value("string"), + } + + elif ("goldp" in self.config.name) or ("tydiqa_id" in self.config.name): + if "source" in self.config.name: + features = datasets.Features( + { + "id": datasets.Value("string"), + "title": datasets.Value("string"), + "context": datasets.Value("string"), + "question": datasets.Value("string"), + "answers": datasets.features.Sequence( + { + "text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + } + ), + } + ) + elif "seacrowd" in self.config.name: + features = schemas.qa_features + features["meta"] = { + "answer_start": datasets.Sequence(datasets.Value("int32")), + } + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + citation=_CITATION, + homepage=_HOMEPAGE, + license=_LICENSE, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + primary_downloaded = dl_manager.download_and_extract(_PRIMARY_URLS) + secondary_downloaded = dl_manager.download_and_extract(_SECONDARY_URLS) + + if ("selectp" in self.config.name) or ("minspan" in self.config.name): + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": primary_downloaded["train"]}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": primary_downloaded["dev"]}, + ), + ] + + elif "goldp" in self.config.name: + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": secondary_downloaded["train"]}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": secondary_downloaded["dev"]}, + ), + ] + elif "tydiqa_id" in self.config.name: + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": secondary_downloaded["train"], "split": "train"}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": secondary_downloaded["train"], "split": "test"}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": secondary_downloaded["dev"], "split": "validation"}, + ), + ] + + def _generate_examples(self, filepath, split=None): + """Yields examples.""" + + if ("selectp" 
in self.config.name) or ("minspan" in self.config.name): + with open(filepath, encoding="utf-8") as f: + for id_, row in enumerate(f): + data = json.loads(row) + passages = data["passage_answer_candidates"] + end_byte = [passage["plaintext_end_byte"] for passage in passages] + start_byte = [passage["plaintext_start_byte"] for passage in passages] + title = data["document_title"] + lang = data["language"] + question = data["question_text"] + annotations = data["annotations"] + yes_no_answers = [annotation["yes_no_answer"] for annotation in annotations] + min_answers_end_byte = [annotation["minimal_answer"]["plaintext_end_byte"] for annotation in annotations] + min_answers_start_byte = [annotation["minimal_answer"]["plaintext_start_byte"] for annotation in annotations] + passage_cand_answers = [annotation["passage_answer"]["candidate_index"] for annotation in annotations] + doc = data["document_plaintext"] + url = data["document_url"] + if (self.config.name == "tydiqa_selectp_source") or (self.config.name == "tydiqa_minspan_source"): + yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url) + elif (self.config.name == "tydiqa_selectp_ind_source") or (self.config.name == "tydiqa_minspan_ind_source"): + if lang == "indonesian": + yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url) + elif (self.config.name == "tydiqa_selectp_tha_source") or (self.config.name == "tydiqa_minspan_tha_source"): + if lang == "thai": + yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url) + # seacrowd + elif (self.config.name == "tydiqa_selectp_seacrowd_qa") or (self.config.name == "tydiqa_minspan_seacrowd_qa"): + yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang) + elif (self.config.name == "tydiqa_selectp_ind_seacrowd_qa") or (self.config.name == "tydiqa_minspan_ind_seacrowd_qa"): + if lang == "indonesian": + yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang) + elif (self.config.name == "tydiqa_selectp_tha_seacrowd_qa") or (self.config.name == "tydiqa_minspan_tha_seacrowd_qa"): + if lang == "thai": + yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang) + else: + raise ValueError(f"No configs to match {self.config.name} in primary_task") + + elif ("goldp" in self.config.name) or ("tydiqa_id" in self.config.name): + with (open(filepath, encoding="utf-8") as f): + data = json.load(f) + tydiqa_id_num = 0 + for article in data["data"]: + title = article.get("title", "").strip() + for paragraph in article["paragraphs"]: + context = paragraph["context"].strip() + for qa in paragraph["qas"]: + question = qa["question"].strip() + id_ = qa["id"] + answer_starts = [answer["answer_start"] for answer in qa["answers"]] + answers = [answer["text"].strip() for answer in qa["answers"]] + if self.config.name == "tydiqa_goldp_source": + yield id_, second_source_helper(id_, title, context, question, 
answer_starts, answers) + + elif self.config.name == "tydiqa_goldp_ind_source": + if id_.startswith("indonesian"): + yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) + elif self.config.name == "tydiqa_id_source": + if id_.startswith("indonesian"): + tydiqa_id_num += 1 + if split == "train" and tydiqa_id_num >= 856: + yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) + if split == "test" and tydiqa_id_num < 856: + yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) + if split == "validation": + yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) + + elif self.config.name == "tydiqa_goldp_seacrowd_qa": + yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) + elif self.config.name == "tydiqa_goldp_ind_seacrowd_qa": + if id_.startswith("indonesian"): + yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) + elif self.config.name == "tydiqa_id_seacrowd_qa": + if id_.startswith("indonesian"): + tydiqa_id_num += 1 + if split == "train" and tydiqa_id_num >= 856: + yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) + if split == "test" and tydiqa_id_num < 856: + yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) + if split == "validation": + yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) + else: + raise ValueError(f"No configs to match {self.config.name} in secondary_task") + + +def primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url): + return { + "passage_answer_candidates": { + "plaintext_start_byte": start_byte, + "plaintext_end_byte": end_byte, + }, + "question_text": question, + "document_title": title, + "language": lang, + "annotations": { + "passage_answer_candidate_index": passage_cand_answers, + "minimal_answers_start_byte": min_answers_start_byte, + "minimal_answers_end_byte": min_answers_end_byte, + "yes_no_answer": yes_no_answers, + }, + "document_plaintext": doc, + "document_url": url, + } + + +def primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang): + return { + "id": str(id_), + "question_id": title, + "document_id": title, + "question": question, + "type": "multiple_choice", + "choices": [""], + "context": doc, + "answer": [""], + "meta": { + "passage_answer_candidates": { + "plaintext_start_byte": start_byte, + "plaintext_end_byte": end_byte, + }, + "annotations": { + "passage_answer_candidate_index": passage_cand_answers, + "minimal_answers_start_byte": min_answers_start_byte, + "minimal_answers_end_byte": min_answers_end_byte, + "yes_no_answer": yes_no_answers, + }, + "language": lang, + }, + } + + +def second_source_helper(id_, title, context, question, answer_starts, answers): + return { + "title": title, + "context": context, + "question": question, + "id": id_, + "answers": { + "answer_start": answer_starts, + "text": answers, + }, + } + + +def second_seacrowd_helper(id_, question, context, answers, answer_starts): + return { + "id": id_, + "question_id": id_, + "document_id": id_, + "question": question, + "type": "abstractive", + "choices": [], + "context": context, + "answer": answers, + "meta": {"answer_start": answer_starts}, + } diff --git 
a/seacrowd/sea_datasets/tydiqa_id/tydiqa_id.py b/seacrowd/sea_datasets/tydiqa_id/tydiqa_id.py deleted file mode 100644 index c11e95f94..000000000 --- a/seacrowd/sea_datasets/tydiqa_id/tydiqa_id.py +++ /dev/null @@ -1,187 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from pathlib import Path -from typing import List - -import datasets -import json - -from seacrowd.utils import schemas -from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Tasks - -_CITATION = """\ -@article{clark-etal-2020-tydi, - title = "{T}y{D}i {QA}: A Benchmark for Information-Seeking Question Answering in Typologically Diverse Languages", - author = "Clark, Jonathan H. and - Choi, Eunsol and - Collins, Michael and - Garrette, Dan and - Kwiatkowski, Tom and - Nikolaev, Vitaly and - Palomaki, Jennimaria", - journal = "Transactions of the Association for Computational Linguistics", - volume = "8", - year = "2020", - address = "Cambridge, MA", - publisher = "MIT Press", - url = "https://aclanthology.org/2020.tacl-1.30", - doi = "10.1162/tacl_a_00317", - pages = "454--470", -} - -@inproceedings{cahyawijaya-etal-2021-indonlg, - title = "{I}ndo{NLG}: Benchmark and Resources for Evaluating {I}ndonesian Natural Language Generation", - author = "Cahyawijaya, Samuel and - Winata, Genta Indra and - Wilie, Bryan and - Vincentio, Karissa and - Li, Xiaohong and - Kuncoro, Adhiguna and - Ruder, Sebastian and - Lim, Zhi Yuan and - Bahar, Syafri and - Khodra, Masayu and - Purwarianti, Ayu and - Fung, Pascale", - booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", - month = nov, - year = "2021", - address = "Online and Punta Cana, Dominican Republic", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.emnlp-main.699", - doi = "10.18653/v1/2021.emnlp-main.699", - pages = "8875--8898" -} -""" - -_LANGUAGES = ["ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) -_LOCAL = False - -_DATASETNAME = "tydiqa_id" - -_DESCRIPTION = """\ -TyDiQA dataset is collected from Wikipedia articles with human-annotated question and answer pairs covering 11 languages. -The question-answer pairs are collected for each language without using translation services. -IndoNLG uses the Indonesian data from the secondary Gold passage task of the original TyDiQA dataset and -randomly split off 15% of the training data and use it as the test set. -""" - -_HOMEPAGE = "https://github.com/IndoNLP/indonlg" - -_LICENSE = "Creative Common Attribution Share-Alike 4.0 International" - -# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. -# In most cases the URLs will be the same for the source and seacrowd config. -# However, if you need to access different files for each config you can have multiple entries in this dict. 
-# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) -_URLS = { - _DATASETNAME: "https://storage.googleapis.com/babert-pretraining/IndoNLG_finals/downstream_task/downstream_task_datasets.zip" -} - -_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] - -_SOURCE_VERSION = "1.0.0" - -_SEACROWD_VERSION = "1.0.0" - - -class TyDiQAIdDataset(datasets.GeneratorBasedBuilder): - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - - BUILDER_CONFIGS = [ - SEACrowdConfig( - name="tydiqa_id_source", - version=SOURCE_VERSION, - description="TyDiQA Id source schema", - schema="source", - subset_id="tydiqa_id", - ), - SEACrowdConfig( - name="tydiqa_id_seacrowd_qa", - version=SEACROWD_VERSION, - description="TyDiQA Id Nusantara schema", - schema="seacrowd_qa", - subset_id="tydiqa_id", - ), - ] - - DEFAULT_CONFIG_NAME = "tydiqa_id_source" - - def _info(self) -> datasets.DatasetInfo: - - if self.config.schema == "source": - features = datasets.Features( - { - "id": datasets.Value("string"), - "context": datasets.Value("string"), - "question": datasets.Value("string"), - "label": datasets.Value("string") - } - ) - elif self.config.schema == "seacrowd_qa": - features = schemas.qa_features - - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION, - ) - - def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: - url = _URLS[_DATASETNAME] - base_path = Path(dl_manager.download_and_extract(url)) - train_data_path = base_path / "IndoNLG_downstream_tasks" / "question_answering" / "train_preprocess.json" - valid_data_path = base_path / "IndoNLG_downstream_tasks" / "question_answering" / "valid_preprocess.json" - test_data_path = base_path / "IndoNLG_downstream_tasks" / "question_answering" / "test_preprocess.json" - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": train_data_path}, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": valid_data_path}, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={"filepath": test_data_path}, - ) - ] - - def _generate_examples(self, filepath: Path): - if self.config.schema == "source": - for example in json.load(open(filepath, 'r')): - yield example["id"], example - elif self.config.schema == "seacrowd_qa": - for example in json.load(open(filepath, 'r')): - yield example["id"], { - "id": example['id'], - "question_id": example['id'], - "document_id": example['id'], - "question": example['question'], - "type": 'abstractive', - "choices": [], - "context": example['context'], - "answer": [example['label']], - "meta": {} - } - else: - raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/typhoon_yolanda_tweets/typhoon_yolanda_tweets.py b/seacrowd/sea_datasets/typhoon_yolanda_tweets/typhoon_yolanda_tweets.py index f3a76d21e..e0b48d7ed 100644 --- a/seacrowd/sea_datasets/typhoon_yolanda_tweets/typhoon_yolanda_tweets.py +++ b/seacrowd/sea_datasets/typhoon_yolanda_tweets/typhoon_yolanda_tweets.py @@ -30,6 +30,9 @@ _HOMEPAGE = "https://github.com/imperialite/Philippine-Languages-Online-Corpora/tree/master/Tweets/Annotated%20Yolanda" +_LOCAL = False +_LANGUAGES = ["fil"] + _LICENSE = Licenses.CC_BY_4_0.value _ROOT_URL = 
"https://raw.githubusercontent.com/imperialite/Philippine-Languages-Online-Corpora/master/Tweets/Annotated%20Yolanda/" diff --git a/seacrowd/sea_datasets/ucla_phonetic/__init__.py b/seacrowd/sea_datasets/ucla_phonetic/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/ucla_phonetic/ucla_phonetic.py b/seacrowd/sea_datasets/ucla_phonetic/ucla_phonetic.py new file mode 100644 index 000000000..cb18a5e4f --- /dev/null +++ b/seacrowd/sea_datasets/ucla_phonetic/ucla_phonetic.py @@ -0,0 +1,158 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This dataset contains audio recordings and phonetic transcriptions of word utterances for various low-resource SEA languages. +Each language has a directory of text and audio files, with the latter forming one data subset. +The dataset is prepared from the online UCLA phonetic dataset, which contains 7000 utterances across 100 low-resource languages, phonetically aligned using various automatic approaches, and manually fixed for misalignments. +""" +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{li2021multilingual, + title={Multilingual phonetic dataset for low resource speech recognition}, + author={Li, Xinjian and Mortensen, David R and Metze, Florian and Black, Alan W}, + booktitle={ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, + pages={6958--6962}, + year={2021}, + organization={IEEE} +} +""" + +_DATASETNAME = "ucla_phonetic" + +_DESCRIPTION = """\ +This dataset contains audio recordings and phonetic transcriptions of word utterances for various low-resource SEA languages. +Each language has a directory of text and audio files, with the latter forming one data subset. +The dataset is prepared from the online UCLA phonetic dataset, which contains 7000 utterances across 100 low-resource languages, phonetically aligned using various automatic approaches, and manually fixed for misalignments. 
+""" + +_HOMEPAGE = "https://github.com/xinjli/ucla-phonetic-corpus" + +_LANGUAGES = ["ace", "brv", "hil", "hni", "ilo", "khm", "mak", "mya", "pam"] + +_LICENSE = Licenses.CC_BY_NC_SA_4_0.value + +_LOCAL = False + +_DATA_URL = "https://github.com/xinjli/ucla-phonetic-corpus/releases/download/v1.0/data.tar.gz" + +_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION] + +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +def seacrowd_config_constructor(lang, schema, version): + if lang not in _LANGUAGES: + raise ValueError(f"Invalid lang {lang}") + + if schema not in ["source", "seacrowd_sptext"]: + raise ValueError(f"Invalid schema: {schema}") + + return SEACrowdConfig( + name=f"ucla_phonetic_{lang}_{schema}", + version=datasets.Version(version), + description=f"UCLA Phonetic {schema} for {lang}", + schema=schema, + subset_id=f"{lang}_{schema}", + ) + + +class UCLAPhoneticDataset(datasets.GeneratorBasedBuilder): + """This dataset contains audio recordings and phonetic transcriptions of word utterances for various low-resource SEA languages.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = ( + [ + SEACrowdConfig( + name="ucla_phonetic_source", + version=datasets.Version(_SOURCE_VERSION), + description="UCLA Phonetic source for ace", + schema="source", + subset_id="ace_source", + ), + SEACrowdConfig( + name="ucla_phonetic_seacrowd_sptext", + version=datasets.Version(_SOURCE_VERSION), + description="UCLA Phonetic seacrowd+sptext for ace", + schema="seacrowd_sptext", + subset_id="ace_seacrowd_sptext", + ), + ] + + [seacrowd_config_constructor(lang, "source", _SOURCE_VERSION) for lang in _LANGUAGES] + + [seacrowd_config_constructor(lang, "seacrowd_sptext", _SEACROWD_VERSION) for lang in _LANGUAGES] + ) + + DEFAULT_CONFIG_NAME = "ucla_phonetic_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features({"id": datasets.Value("string"), "text": datasets.Value("string"), "audio": datasets.Audio(sampling_rate=16_000)}) + elif self.config.schema == "seacrowd_sptext": + features = schemas.speech_text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + lang, schema = self.config.subset_id.split("_", maxsplit=1) + data_dir = dl_manager.download_and_extract(_DATA_URL) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join(data_dir, "data", lang, "text.txt"), + "audiopath": Path(os.path.join(data_dir, "data", lang, "audio")), + }, + ) + ] + + def _generate_examples(self, filepath: Path, audiopath: Path) -> Tuple[int, Dict]: + + audiofiles = {} + for audiofile in audiopath.iterdir(): + audio_idx = os.path.basename(audiofile).split(".")[0] + audiofiles[audio_idx] = audiofile + + if self.config.schema == "source": + for line_idx, line in enumerate(open(filepath)): + audio_idx, text = line.strip().split(maxsplit=1) + yield line_idx, {"id": line_idx, "text": text, "audio": str(audiofiles[audio_idx])} + + elif self.config.schema == "seacrowd_sptext": + for line_idx, line in enumerate(open(filepath)): + audio_idx, text = line.strip().split(maxsplit=1) + yield line_idx, {"id": line_idx, "path": 
str(audiofiles[audio_idx]), "audio": str(audiofiles[audio_idx]), "text": text, "speaker_id": None, "metadata": {"speaker_age": None, "speaker_gender": None}} diff --git a/seacrowd/sea_datasets/ud_jv_csui/__init__.py b/seacrowd/sea_datasets/ud_jv_csui/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/ud_jv_csui/ud_jv_csui.py b/seacrowd/sea_datasets/ud_jv_csui/ud_jv_csui.py new file mode 100644 index 000000000..dfc29748e --- /dev/null +++ b/seacrowd/sea_datasets/ud_jv_csui/ud_jv_csui.py @@ -0,0 +1,256 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.common_parser import load_ud_data, load_ud_data_as_seacrowd_kb +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@unpublished{Alfina2023, + author = {Alfina, Ika and Yuliawati, Arlisa and Tanaya, Dipta and Dinakaramani, Arawinda and Zeman, Daniel}, + title = {{A Gold Standard Dataset for Javanese Tokenization, POS Tagging, Morphological Feature Tagging, and Dependency Parsing}}, + year = {2023} +} +""" + +_DATASETNAME = "ud_jv_csui" + +_DESCRIPTION = """\ +UD Javanese-CSUI is a dependency treebank in Javanese, a regional language in Indonesia with more than 68 million users. +It was developed by Alfina et al. from the Faculty of Computer Science, Universitas Indonesia. +The newest version has 1000 sentences and 14K words with manual annotation. + +The sentences use the Latin script and do not use the original writing system of Javanese (Hanacaraka). + +The original sentences were taken from several resources: +1. Javanese reference grammar books (125 sents) +2. OPUS, especially from the Javanese section of the WikiMatrix v1 corpus (150 sents) +3. Online news (Solopos) (725 sents) + +Javanese has several language levels (register), such as Ngoko, Krama, Krama Inggil, and Krama Andhap. +In this treebank, the sentences predominantly use Ngoko words, some of which use Krama words. +""" + +_HOMEPAGE = "https://github.com/UniversalDependencies/UD_Javanese-CSUI" + +_LANGUAGES = ["jav"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.CC_BY_SA_4_0.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://raw.githubusercontent.com/UniversalDependencies/UD_Javanese-CSUI/master/jv_csui-ud-test.conllu", +} + +_SUPPORTED_TASKS = [Tasks.DEPENDENCY_PARSING, Tasks.MACHINE_TRANSLATION, Tasks.POS_TAGGING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +def _resolve_misannotation_(dataset): + """Resolving mis-annotation in the raw data. In-place.""" + for d in dataset: + # Metadata's typos + if d["sent_id"] == "opus-wiki-5": # From the raw file. Thrown-away during parsing due to no field name. 
+ d.setdefault("text_en", "Prior to World War II, 14 commercial and 12 public radios could be operated in France.") + if d["sent_id"] == "wedhawati-2001-66": # empty string + d.setdefault("text_en", "Reading can expand knowledge.") + if d["sent_id"] == "opus-wiki-72": + d["text_en"] = d.pop("text-en") # metadata mis-titled + if d["sent_id"] == "opus-wiki-27": + d["text_id"] = d.pop("tex_id") # metadata mis-titled + + # Problems on the annotation itself + if d["sent_id"] == "solopos-2022-42": # POS tag is also wrong. Proceed with caution. + d["form"][1] = d["form"][1].replace("tresnane", "tresna") # tresna + e + if d["sent_id"] == "solopos-2022-93": # wrong annot + d["form"][10] = d["form"][10].replace("tengene", "tengen") # tengen + e + if d["sent_id"] == "solopos-2022-506": # annotation inconsistency on occurrences of word "sedina" + d["form"][3] = d["form"][3].replace("siji", "se") + if d["sent_id"] == "solopos-2022-711": # annotation inconsistency on the word "rasah" from "ra" and "usah" + d["form"][11] = d["form"][11].replace("usah", "sah") + + return dataset + + +class UdJvCsuiDataset(datasets.GeneratorBasedBuilder): + """Treebank of Javanese comprises 1030 sentences from 14K words with manual annotation""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + # source: https://universaldependencies.org/u/pos/ + UPOS_TAGS = ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_kb", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd KB schema", + schema="seacrowd_kb", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_t2t", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd Text-to-Text schema", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd Seq Label schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + # metadata + "sent_id": datasets.Value("string"), + "text": datasets.Value("string"), + "text_id": datasets.Value("string"), + "text_en": datasets.Value("string"), + # tokens + "id": [datasets.Value("string")], + "form": [datasets.Value("string")], + "lemma": [datasets.Value("string")], + "upos": [datasets.Value("string")], + "xpos": [datasets.Value("string")], + "feats": [datasets.Value("string")], + "head": [datasets.Value("string")], + "deprel": [datasets.Value("string")], + "deps": [datasets.Value("string")], + "misc": [datasets.Value("string")], + } + ) + elif self.config.schema == "seacrowd_kb": + features = schemas.kb_features + + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + elif self.config.schema == "seacrowd_seq_label": + features = schemas.seq_label_features(self.UPOS_TAGS) + + else: + raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + 
homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_path = dl_manager.download(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, # https://github.com/UniversalDependencies/UD_Javanese-CSUI#split + gen_kwargs={"filepath": data_path}, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + # Note from hudi_f: + # Other than 3 sentences with multi-span of length 3, the data format seems fine. + # Thus, it is safe to ignore the assertion. (as of 2024/02/14) + dataset = list( + load_ud_data( + filepath, + filter_kwargs={"id": lambda i: isinstance(i, int)}, + # assert_fn=assert_multispan_range_is_one + ) + ) + _resolve_misannotation_(dataset) + + for d in dataset: + if "text_id" not in d or "text_en" not in d: + print(d) + + if self.config.schema == "source": + pass + + elif self.config.schema == "seacrowd_kb": + dataset = load_ud_data_as_seacrowd_kb( + filepath, + dataset, + morph_exceptions=[ + # Exceptions due to inconsistencies in the raw data annotation + ("ne", "e"), + ("nipun", "ipun"), + ("me", "e"), # occurrence word: "Esemme" = "Esem" + "e". original text has double 'm'. + ], + ) + + elif self.config.schema == "seacrowd_t2t": + dataset = list( + map( + lambda d: { + "id": d["sent_id"], + "text_1": d["text"], + "text_2": d["text_id"], + "text_1_name": "jav", + "text_2_name": "ind", + }, + dataset, + ) + ) + + elif self.config.schema == "seacrowd_seq_label": + dataset = list( + map( + lambda d: { + "id": d["sent_id"], + "tokens": d["form"], + "labels": d["upos"], + }, + dataset, + ) + ) + + else: + raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.") + + for key, example in enumerate(dataset): + yield key, example diff --git a/seacrowd/sea_datasets/udhr/udhr.py b/seacrowd/sea_datasets/udhr/udhr.py index bb4445da6..dd0a2dcd2 100644 --- a/seacrowd/sea_datasets/udhr/udhr.py +++ b/seacrowd/sea_datasets/udhr/udhr.py @@ -72,6 +72,9 @@ "zlm": "Malay", # default mly_latn } +_LOCAL=False +_LANGUAGES=["ace", "ban", "bcl", "blt", "bug", "ceb", "cfm", "cnh", "ctd", "duu", "hil", "hlt", "hni", "hnj", "ilo", "ind", "jav", "khm", "kkh", "lao", "lus", "mad", "min", "mnw", "mya", "pam", "shn", "sun", "tdt", "tet", "tgl", "tha", "vie", "war", "zlm"] + def seacrowd_config_constructor(src_lang, schema, version): if src_lang == "": raise ValueError(f"Invalid src_lang {src_lang}") diff --git a/seacrowd/sea_datasets/uit_vicov19qa/__init__.py b/seacrowd/sea_datasets/uit_vicov19qa/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/uit_vicov19qa/uit_vicov19qa.py b/seacrowd/sea_datasets/uit_vicov19qa/uit_vicov19qa.py new file mode 100644 index 000000000..936e603aa --- /dev/null +++ b/seacrowd/sea_datasets/uit_vicov19qa/uit_vicov19qa.py @@ -0,0 +1,167 @@ +# coding=utf-8 +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{thai-etal-2022-uit, +title = "{UIT}-{V}i{C}o{V}19{QA}: A Dataset for {COVID}-19 Community-based Question Answering on {V}ietnamese Language", +author = "Thai, Triet and Thao-Ha, Ngan Chu and Vo, Anh and Luu, Son", +editor = "Dita, Shirley and Trillanes, Arlene and Lucas, Rochelle Irene", +booktitle = 
"Proceedings of the 36th Pacific Asia Conference on Language, Information and Computation", +month = oct, +year = "2022", +address = "Manila, Philippines", +publisher = "Association for Computational Linguistics", +url = "https://aclanthology.org/2022.paclic-1.88", +pages = "801--810", +} +""" +_DATASETNAME = "uit_vicov19qa" +_DESCRIPTION = """\ +UIT-ViCoV19QA is the first Vietnamese community-based question answering dataset for developing question answering +systems for COVID-19. The dataset comprises 4,500 question-answer pairs collected from trusted medical sources, +with at least one answer and at most four unique paraphrased answers per question. This dataset contains 1800 questions +that have at least two answers, 700 questions have at least three answers and half of them have a maximum of four paraphrased +answers. +""" +_HOMEPAGE = "https://github.com/triet2397/UIT-ViCoV19QA" +_LANGUAGES = ["vie"] +_LICENSE = Licenses.UNKNOWN.value +_PAPER_URL = "https://aclanthology.org/2022.paclic-1.88" +_LOCAL = False +_URLS = { + "train": { + "1_ans": "https://raw.githubusercontent.com/triet2397/UIT-ViCoV19QA/main/dataset/1_ans/UIT-ViCoV19QA_train.csv", + "2_ans": "https://raw.githubusercontent.com/triet2397/UIT-ViCoV19QA/main/dataset/2_ans/UIT-ViCoV19QA_train.csv", + "3_ans": "https://raw.githubusercontent.com/triet2397/UIT-ViCoV19QA/main/dataset/3_ans/UIT-ViCoV19QA_train.csv", + "4_ans": "https://raw.githubusercontent.com/triet2397/UIT-ViCoV19QA/main/dataset/4_ans/UIT-ViCoV19QA_train.csv", + }, + "val": { + "1_ans": "https://raw.githubusercontent.com/triet2397/UIT-ViCoV19QA/main/dataset/1_ans/UIT-ViCoV19QA_val.csv", + "2_ans": "https://raw.githubusercontent.com/triet2397/UIT-ViCoV19QA/main/dataset/2_ans/UIT-ViCoV19QA_val.csv", + "3_ans": "https://raw.githubusercontent.com/triet2397/UIT-ViCoV19QA/main/dataset/3_ans/UIT-ViCoV19QA_val.csv", + "4_ans": "https://raw.githubusercontent.com/triet2397/UIT-ViCoV19QA/main/dataset/4_ans/UIT-ViCoV19QA_val.csv", + }, + "test": { + "1_ans": "https://raw.githubusercontent.com/triet2397/UIT-ViCoV19QA/main/dataset/1_ans/UIT-ViCoV19QA_test.csv", + "2_ans": "https://raw.githubusercontent.com/triet2397/UIT-ViCoV19QA/main/dataset/2_ans/UIT-ViCoV19QA_test.csv", + "3_ans": "https://raw.githubusercontent.com/triet2397/UIT-ViCoV19QA/main/dataset/3_ans/UIT-ViCoV19QA_test.csv", + "4_ans": "https://raw.githubusercontent.com/triet2397/UIT-ViCoV19QA/main/dataset/4_ans/UIT-ViCoV19QA_test.csv", + }, +} +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class ViHealthQADataset(datasets.GeneratorBasedBuilder): + """ + This is a SeaCrowed dataloader for dataset uit_vicov19qa, The dataset comprises 4,500 question-answer pairs collected from trusted medical sources, + with at least one answer and at most four unique paraphrased answers per question. 
+ """ + + subsets = ["1_ans", "2_ans", "3_ans", "4_ans"] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", subset_id=f"{_DATASETNAME}"), + + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_qa", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_qa", + subset_id=f"{_DATASETNAME}", + ) + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "question": datasets.Value("string"), + "answers": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_qa": + features = schemas.qa_features + else: + raise ValueError(f"No schema matched for {self.config.schema}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + data_dir = dl_manager.download_and_extract(_URLS) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir["train"], + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_dir["val"], + "split": "val", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_dir["test"], + "split": "test", + }, + ), + ] + + def _generate_examples(self, filepath: Dict, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + print(f"Generating examples for split {split}") + sample_id = -1 + for path in filepath.values(): + raw_examples = pd.read_csv(path, na_filter=False, delimiter="|") + for eid, exam in raw_examples.iterrows(): + sample_id += 1 + exam_id = exam[0] + exam_quest = exam[1] + exam_answers = exam[2:].values + if self.config.schema == "source": + yield sample_id, {"id": str(exam_id), + "question": exam_quest, + "answers": exam_answers + } + + elif self.config.schema == "seacrowd_qa": + yield sample_id, {"id": str(sample_id), + "question_id": exam_id, + "document_id": str(sample_id), + "question": exam_quest, + "type": None, + "choices": [], + "context": None, + "answer": exam_answers, + "meta": {} + } diff --git a/seacrowd/sea_datasets/uit_victsd/__init__.py b/seacrowd/sea_datasets/uit_victsd/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/uit_victsd/uit_victsd.py b/seacrowd/sea_datasets/uit_victsd/uit_victsd.py new file mode 100644 index 000000000..072d29ea9 --- /dev/null +++ b/seacrowd/sea_datasets/uit_victsd/uit_victsd.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@inproceedings{, + author = {Nguyen, Luan Thanh and Van Nguyen, Kiet and Nguyen, Ngan Luu-Thuy}, + title = {Constructive and Toxic Speech Detection for Open-domain Social Media Comments in Vietnamese}, + booktitle = {Advances and Trends in Artificial Intelligence. Artificial Intelligence Practices}, + year = {2021}, + publisher = {Springer International Publishing}, + address = {Kuala Lumpur, Malaysia}, + pages = {572--583}, +} +""" + +_LOCAL = False +_LANGUAGES = ["vie"] +_DATASETNAME = "uit_victsd" +_DESCRIPTION = """ +The UIT-ViCTSD (Vietnamese Constructive and Toxic Speech Detection dataset) is a compilation of 10,000 human-annotated +comments intended for constructive and toxic comments detection. The dataset spans 10 domains, reflecting the diverse topics +and expressions found in social media interactions among Vietnamese users. +""" + +_HOMEPAGE = "https://github.com/tarudesu/ViCTSD" +_LICENSE = Licenses.UNKNOWN.value +_URL = "https://huggingface.co/datasets/tarudesu/ViCTSD" + + +_SUPPORTED_TASKS = [Tasks.INTENT_CLASSIFICATION, Tasks.ABUSIVE_LANGUAGE_PREDICTION] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class UiTViCTSDDataset(datasets.GeneratorBasedBuilder): + """ + Dataset of Vietnamese social media comments annotated + for constructiveness and toxicity. + """ + + SUBSETS = ["constructiveness", "toxicity"] + CLASS_LABELS = [0, 1] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema for {subset} subset", + schema="source", + subset_id=f"{_DATASETNAME}_{subset}", + ) + for subset in SUBSETS + ] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_seacrowd_text", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema for {subset} subset", + schema="seacrowd_text", + subset_id=f"{_DATASETNAME}_{subset}", + ) + for subset in SUBSETS + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_constructiveness_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "Unnamed: 0": datasets.Value("int64"), # Column name missing in original dataset + "Comment": datasets.Value("string"), + "Constructiveness": datasets.ClassLabel(names=self.CLASS_LABELS), + "Toxicity": datasets.ClassLabel(names=self.CLASS_LABELS), + "Title": datasets.Value("string"), + "Topic": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_text": + features = schemas.text_features(label_names=self.CLASS_LABELS) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + # dl_manager not used since dataloader uses HF 'load_dataset' + return [datasets.SplitGenerator(name=split, gen_kwargs={"split": split._name}) for split in (datasets.Split.TRAIN, datasets.Split.VALIDATION, datasets.Split.TEST)] + + def _load_hf_data_from_remote(self, split: str) -> datasets.DatasetDict: + """Load dataset from HuggingFace.""" + HF_REMOTE_REF = 
"/".join(_URL.split("/")[-2:]) + _hf_dataset_source = datasets.load_dataset(HF_REMOTE_REF, split=split) + return _hf_dataset_source + + def _generate_examples(self, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + data = self._load_hf_data_from_remote(split=split) + for index, row in enumerate(data): + if self.config.schema == "source": + example = row + + elif self.config.schema == "seacrowd_text": + if "constructiveness" in self.config.name: + label = row["Constructiveness"] + elif "toxicity" in self.config.name: + label = row["Toxicity"] + example = {"id": str(index), "text": row["Comment"], "label": label} + yield index, example diff --git a/seacrowd/sea_datasets/uit_vihsd/__init__.py b/seacrowd/sea_datasets/uit_vihsd/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/uit_vihsd/uit_vihsd.py b/seacrowd/sea_datasets/uit_vihsd/uit_vihsd.py new file mode 100644 index 000000000..235413eaa --- /dev/null +++ b/seacrowd/sea_datasets/uit_vihsd/uit_vihsd.py @@ -0,0 +1,145 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Dict, List, Tuple +import datasets +import pandas + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks +_CITATION = """ +@InProceedings{10.1007/978-3-030-79457-6_35, +author="Luu, Son T. +and Nguyen, Kiet Van +and Nguyen, Ngan Luu-Thuy", +editor="Fujita, Hamido +and Selamat, Ali +and Lin, Jerry Chun-Wei +and Ali, Moonis", +title="A Large-Scale Dataset for Hate Speech Detection on Vietnamese Social Media Texts", +booktitle="Advances and Trends in Artificial Intelligence. Artificial Intelligence Practices", +year="2021", +publisher="Springer International Publishing", +address="Cham", +pages="415--426", +abstract="In recent years, Vietnam witnesses the mass development of social network users on different social +platforms such as Facebook, Youtube, Instagram, and Tiktok. On social media, hate speech has become a critical +problem for social network users. To solve this problem, we introduce the ViHSD - a human-annotated dataset for +automatically detecting hate speech on the social network. This dataset contains over 30,000 comments, each comment +in the dataset has one of three labels: CLEAN, OFFENSIVE, or HATE. Besides, we introduce the data creation process +for annotating and evaluating the quality of the dataset. Finally, we evaluate the dataset by deep learning and transformer models.", +isbn="978-3-030-79457-6" +} +""" + +_LOCAL = False +_LANGUAGES = ["vie"] +_DATASETNAME = "uit_vihsd" +_DESCRIPTION = """ +The ViHSD dataset consists of comments collected from Facebook pages and YouTube channels that have a +high-interactive rate, and do not restrict comments. This dataset is used for hate speech detection on +Vietnamese language. 
Data is anonymized, and labeled as either HATE, OFFENSIVE, or CLEAN. +""" + +_HOMEPAGE = "https://github.com/sonlam1102/vihsd/" +_LICENSE = Licenses.UNKNOWN.value +_URL = "https://raw.githubusercontent.com/sonlam1102/vihsd/main/data/vihsd.zip" + +_Split_Path = { + "train": "vihsd/train.csv", + "validation": "vihsd/dev.csv", + "test": "vihsd/test.csv", +} + +_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class UiTVihsdDataset(datasets.GeneratorBasedBuilder): + """ + The SeaCrowd dataloader for the dataset Vietnamese Hate Speech Detection (UIT-ViHSD). + """ + + CLASS_LABELS = ["CLEAN", "OFFENSIVE", "HATE"] # 0:CLEAN, 1:OFFENSIVE, 2:HATE + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_text", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema ", + schema="seacrowd_text", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("int64"), + "text": datasets.Value("string"), + "label": datasets.Value("string"), + } + ) + + elif self.config.schema == "seacrowd_text": + features = schemas.text_features(label_names=self.CLASS_LABELS) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + file_paths = dl_manager.download_and_extract(_URL) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": os.path.join(file_paths, _Split_Path["train"])}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": os.path.join(file_paths, _Split_Path["validation"])}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": os.path.join(file_paths, _Split_Path["test"])}, + ), + ] + + def _generate_examples(self, filepath) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + data_lines = pandas.read_csv(filepath) + for row in data_lines.itertuples(): + if self.config.schema == "source": + example = {"id": str(row.Index), "text": row.free_text, "label": row.label_id} + if self.config.schema == "seacrowd_text": + example = {"id": str(row.Index), "text": row.free_text, "label": self.CLASS_LABELS[int(row.label_id)]} + yield row.Index, example + diff --git a/seacrowd/sea_datasets/uit_viic/__init__.py b/seacrowd/sea_datasets/uit_viic/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/uit_viic/uit_viic.py b/seacrowd/sea_datasets/uit_viic/uit_viic.py new file mode 100644 index 000000000..80712f4ac --- /dev/null +++ b/seacrowd/sea_datasets/uit_viic/uit_viic.py @@ -0,0 +1,150 @@ +# coding=utf-8 +import json +import os.path + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_DATASETNAME = "uit_viic" +_CITATION = """\ +@InProceedings{10.1007/978-3-030-63007-2_57, +author="Lam, Quan Hoang +and Le, Quang Duy +and Nguyen, Van Kiet +and Nguyen, Ngan Luu-Thuy", 
+editor="Nguyen, Ngoc Thanh +and Hoang, Bao Hung +and Huynh, Cong Phap +and Hwang, Dosam +and Trawi{\'{n}}ski, Bogdan +and Vossen, Gottfried", +title="UIT-ViIC: A Dataset for the First Evaluation on Vietnamese Image Captioning", +booktitle="Computational Collective Intelligence", +year="2020", +publisher="Springer International Publishing", +address="Cham", +pages="730--742", +abstract="Image Captioning (IC), the task of automatic generation of image captions, has attracted +attentions from researchers in many fields of computer science, being computer vision, natural language +processing and machine learning in recent years. This paper contributes to research on Image Captioning +task in terms of extending dataset to a different language - Vietnamese. So far, there has been no existed +Image Captioning dataset for Vietnamese language, so this is the foremost fundamental step for developing +Vietnamese Image Captioning. In this scope, we first built a dataset which contains manually written +captions for images from Microsoft COCO dataset relating to sports played with balls, we called this dataset +UIT-ViIC (University Of Information Technology - Vietnamese Image Captions). UIT-ViIC consists of 19,250 +Vietnamese captions for 3,850 images. Following that, we evaluated our dataset on deep neural network models +and did comparisons with English dataset and two Vietnamese datasets built by different methods. UIT-ViIC +is published on our lab website (https://sites.google.com/uit.edu.vn/uit-nlp/) for research purposes.", +isbn="978-3-030-63007-2" +} +""" + +_DESCRIPTION = """ +UIT-ViIC contains manually written captions for images from Microsoft COCO dataset relating to sports +played with ball. UIT-ViIC consists of 19,250 Vietnamese captions for 3,850 images. For each image, +UIT-ViIC provides five Vietnamese captions annotated by five annotators. 
+""" + +_HOMEPAGE = "https://drive.google.com/file/d/1YexKrE6o0UiJhFWpE8M5LKoe6-k3AiM4" +_PAPER_URL = "https://arxiv.org/abs/2002.00175" +_LICENSE = Licenses.UNKNOWN.value +_HF_URL = "" +_LANGUAGES = ["vi"] +_LOCAL = False +_SUPPORTED_TASKS = [Tasks.IMAGE_CAPTIONING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + +_URLS = "https://drive.google.com/uc?export=download&id=1YexKrE6o0UiJhFWpE8M5LKoe6-k3AiM4" +_Split_Path = { + "train": "UIT-ViIC/uitviic_captions_train2017.json", + "validation": "UIT-ViIC/uitviic_captions_val2017.json", + "test": "UIT-ViIC/uitviic_captions_test2017.json", +} + + +class UITViICDataset(datasets.GeneratorBasedBuilder): + BUILDER_CONFIGS = [ + SEACrowdConfig(name=f"{_DATASETNAME}_source", version=datasets.Version(_SOURCE_VERSION), description=_DESCRIPTION, subset_id=f"{_DATASETNAME}", schema="source"), + SEACrowdConfig(name=f"{_DATASETNAME}_seacrowd_imtext", version=datasets.Version(_SEACROWD_VERSION), description=_DESCRIPTION, subset_id=f"{_DATASETNAME}", schema="seacrowd_imtext"), + ] + + def _info(self): + if self.config.schema == "source": + features = datasets.Features( + { + "license": datasets.Value("int32"), + "file_name": datasets.Value("string"), + "coco_url": datasets.Value("string"), + "flickr_url": datasets.Value("string"), + "height": datasets.Value("int32"), + "width": datasets.Value("int32"), + "date_captured": datasets.Value("string"), + "image_id": datasets.Value("int32"), + "caption": datasets.Value("string"), + "cap_id": datasets.Value("int32"), + } + ) + elif self.config.schema == "seacrowd_imtext": + features = schemas.image_text_features() + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + license=_LICENSE, + homepage=_HOMEPAGE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + file_paths = dl_manager.download_and_extract(_URLS) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": os.path.join(file_paths, _Split_Path["train"])}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": os.path.join(file_paths, _Split_Path["validation"])}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": os.path.join(file_paths, _Split_Path["test"])}, + ), + ] + + def _generate_examples(self, filepath): + """Yields examples.""" + with open(filepath, encoding="utf-8") as f: + json_dict = json.load(f) + images = {itm["id"]: itm for itm in json_dict["images"]} + captns = json_dict["annotations"] + + for idx, capt in enumerate(captns): + image_id = capt["image_id"] + if self.config.schema == "source": + yield idx, { + "license": images[image_id]["license"], + "file_name": images[image_id]["file_name"], + "coco_url": images[image_id]["coco_url"], + "flickr_url": images[image_id]["flickr_url"], + "height": images[image_id]["height"], + "width": images[image_id]["width"], + "date_captured": images[image_id]["date_captured"], + "image_id": capt["image_id"], + "caption": capt["caption"], + "cap_id": capt["id"], + } + elif self.config.schema == "seacrowd_imtext": + yield idx, { + "id": capt["id"], + "image_paths": [images[image_id]["coco_url"], images[image_id]["flickr_url"]], + "texts": capt["caption"], + "metadata": { + "context": "", + "labels": ["Yes"], + }, + } diff --git a/seacrowd/sea_datasets/uit_viocd/__init__.py b/seacrowd/sea_datasets/uit_viocd/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/uit_viocd/uit_viocd.py 
b/seacrowd/sea_datasets/uit_viocd/uit_viocd.py new file mode 100644 index 000000000..d6bc930b9 --- /dev/null +++ b/seacrowd/sea_datasets/uit_viocd/uit_viocd.py @@ -0,0 +1,141 @@ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@incollection{nguyen2021vietnamese, + title={Vietnamese Complaint Detection on E-Commerce Websites}, + author={Nguyen, Nhung Thi-Hong and Ha, Phuong Phan-Dieu and Nguyen, Luan Thanh and Nguyen, Kiet Van and Nguyen, Ngan Luu-Thuy}, + booktitle={New Trends in Intelligent Software Methodologies, Tools and Techniques}, + pages={618--629}, + year={2021}, + publisher={IOS Press} +} +""" + +_DATASETNAME = "uit_viocd" + +_DESCRIPTION = """\ +The UIT-ViOCD dataset includes 5,485 reviews e-commerce sites across four categories: fashion, cosmetics, applications, +and phones. Each review is annotated by humans, assigning a label of 1 for complaints and 0 for non-complaints. +The dataset is divided into training, validation, and test sets, distributed approximately in an 80:10:10 ratio. +""" + +_HOMEPAGE = "https://huggingface.co/datasets/tarudesu/ViOCD" + +_LANGUAGES = ["vie"] + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + "train": "https://huggingface.co/datasets/tarudesu/ViOCD/resolve/main/train.csv?download=true", + "val": "https://huggingface.co/datasets/tarudesu/ViOCD/resolve/main/val.csv?download=true", + "test": "https://huggingface.co/datasets/tarudesu/ViOCD/resolve/main/test.csv?download=true", +} + +_SUPPORTED_TASKS = [Tasks.COMPLAINT_DETECTION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class UITVIOCDDataset(datasets.GeneratorBasedBuilder): + """The UIT-ViOCD dataset includes 5,485 reviews e-commerce sites across four categories: fashion, cosmetics, applications, and phones.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + LABEL_CLASSES = [1, 0] + + SEACROWD_SCHEMA_NAME = "text" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "review": datasets.Value("string"), + "review_tokenize": datasets.Value("string"), + "label": datasets.ClassLabel(names=self.LABEL_CLASSES), + "domain": datasets.Value("string"), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text_features(self.LABEL_CLASSES) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + data_dir = dl_manager.download_and_extract(_URLS) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir["train"], + }, + ), + datasets.SplitGenerator( + 
name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_dir["test"], + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_dir["val"], + }, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + df = pd.read_csv(filepath) + + if self.config.schema == "source": + for key, example in df.iterrows(): + yield key, { + "review": example["review"], + "review_tokenize": example["review_tokenize"], + "label": example["label"], + "domain": example["domain"], + } + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + for key, example in df.iterrows(): + yield key, {"id": str(key), "text": str(example["review"]), "label": int(example["label"])} diff --git a/seacrowd/sea_datasets/uit_vion/__init__.py b/seacrowd/sea_datasets/uit_vion/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/uit_vion/uit_vion.py b/seacrowd/sea_datasets/uit_vion/uit_vion.py new file mode 100644 index 000000000..32d6eed9e --- /dev/null +++ b/seacrowd/sea_datasets/uit_vion/uit_vion.py @@ -0,0 +1,170 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{fujita2021empirical, + title={An Empirical Investigation of Online News Classification on an Open-Domain, Large-Scale and High-Quality Dataset in Vietnamese}, + author={Fujita, H and Perez-Meana, H}, + booktitle={New Trends in Intelligent Software Methodologies, Tools and Techniques: Proceedings of the 20th International Conference on New Trends in Intelligent Software Methodologies, Tools and Techniques (SoMeT_21)}, + volume={337}, + pages={367}, + year={2021}, + organization={IOS Press} +} +""" + +_DATASETNAME = "uit_vion" + + +_DESCRIPTION = """\ +UIT-ViON (Vietnamese Online Newspaper) is a dataset collected from well-known online newspapers in Vietnamese. +The UIT-ViON is an open-domain, large-scale, and high-quality dataset consisting of 260,000 textual data +points annotated with 13 different categories for evaluating Vietnamese short text classification. +The dataset is split into training, validation, and test sets, each containing 208000, 26000, +and 26000 pieces of text, respectively. 
+""" + +_HOMEPAGE = "https://github.com/kh4nh12/UIT-ViON-Dataset" + +_LANGUAGES = ["vie"] + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://github.com/kh4nh12/UIT-ViON-Dataset/archive/refs/heads/master.zip", +} + +_SUPPORTED_TASKS = [Tasks.INTENT_CLASSIFICATION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class UitVion(datasets.GeneratorBasedBuilder): + """UIT-ViON (Vietnamese Online Newspaper) is a dataset collected from well-known online newspapers in Vietnamese.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + LABEL_CLASSES = [i for i in range(13)] + + SEACROWD_SCHEMA_NAME = "text" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "title": datasets.Value("string"), + "link": datasets.Value("string"), + "label": datasets.ClassLabel(names=self.LABEL_CLASSES), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text_features(self.LABEL_CLASSES) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + file_dir = os.path.join("UIT-ViON-Dataset-main", "data.zip") + data_dir = os.path.join(data_dir, file_dir) + data_dir = dl_manager.download_and_extract(data_dir) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(data_dir, "UIT-ViON_train.csv"), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": os.path.join(data_dir, "UIT-ViON_test.csv"), + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir, "UIT-ViON_dev.csv"), + "split": "dev", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + data = pd.read_csv(filepath) + + if self.config.schema == "source": + for i, row in data.iterrows(): + yield i, { + "title": str(row["title"]), + "link": str(row["link"]), + "label": row["label"], + } + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + for i, row in data.iterrows(): + yield i, { + "id": str(i), + "text": str(row["title"]), + "label": int(row["label"]), + } diff --git a/seacrowd/sea_datasets/uit_vsfc/__init__.py b/seacrowd/sea_datasets/uit_vsfc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/uit_vsfc/uit_vsfc.py b/seacrowd/sea_datasets/uit_vsfc/uit_vsfc.py new file mode 100644 index 000000000..cd849078a --- /dev/null +++ b/seacrowd/sea_datasets/uit_vsfc/uit_vsfc.py @@ -0,0 +1,203 @@ +# coding=utf-8 +# 
Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{van2018uit, + title={UIT-VSFC: Vietnamese students’ feedback corpus for sentiment analysis}, + author={Van Nguyen, Kiet and Nguyen, Vu Duc and Nguyen, Phu XV and Truong, Tham TH and Nguyen, Ngan Luu-Thuy}, + booktitle={2018 10th international conference on knowledge and systems engineering (KSE)}, + pages={19--24}, + year={2018}, + organization={IEEE} +} +""" + + +_DATASETNAME = "uit_vsfc" + +_DESCRIPTION = """\ +This corpus consists of student feedback obtained from end-of-semester surveys at a Vietnamese university. +Feedback is classified into four possible topics: lecturer, curriculum, facility or others. +Feedback is also labeled as one of three sentiment polarities: positive, negative or neutral. +""" + +_HOMEPAGE = "https://drive.google.com/drive/folders/1HooABJyrddVGzll7fgkJ6VzkG_XuWfRu" + +_LANGUAGES = ["vie"] + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = False + + +_URLS = { + "train": { + "sentences": "https://drive.google.com/uc?id=1nzak5OkrheRV1ltOGCXkT671bmjODLhP&export=download", + "sentiments": "https://drive.google.com/uc?id=1ye-gOZIBqXdKOoi_YxvpT6FeRNmViPPv&export=download", + "topics": "https://drive.google.com/uc?id=14MuDtwMnNOcr4z_8KdpxprjbwaQ7lJ_C&export=download", + }, + "validation": { + "sentences": "https://drive.google.com/uc?id=1sMJSR3oRfPc3fe1gK-V3W5F24tov_517&export=download", + "sentiments": "https://drive.google.com/uc?id=1GiY1AOp41dLXIIkgES4422AuDwmbUseL&export=download", + "topics": "https://drive.google.com/uc?id=1DwLgDEaFWQe8mOd7EpF-xqMEbDLfdT-W&export=download", + }, + "test": { + "sentences": "https://drive.google.com/uc?id=1aNMOeZZbNwSRkjyCWAGtNCMa3YrshR-n&export=download", + "sentiments": "https://drive.google.com/uc?id=1vkQS5gI0is4ACU58-AbWusnemw7KZNfO&export=download", + "topics": "https://drive.google.com/uc?id=1_ArMpDguVsbUGl-xSMkTF_p5KpZrmpSB&export=download", + }, +} + +_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS, Tasks.TOPIC_MODELING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class UITVSFCDataset(datasets.GeneratorBasedBuilder): + """This corpus consists of student feedback obtained from end-of-semester surveys at a Vietnamese university. + Feedback is classified into four possible topics: lecturer, curriculum, facility or others. 
+    Feedback is also labeled as one of three sentiment polarities: positive, negative or neutral."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    SENTIMENT_LABEL_CLASSES = ["positive", "negative", "neutral"]
+    TOPIC_LABEL_CLASSES = ["lecturer", "training_program", "others", "facility"]
+
+    SEACROWD_SCHEMA_NAME = "text"
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_sentiment_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=_DATASETNAME,
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_topic_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=_DATASETNAME,
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_sentiment_seacrowd_{SEACROWD_SCHEMA_NAME}",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
+            subset_id=_DATASETNAME,
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_topic_seacrowd_{SEACROWD_SCHEMA_NAME}",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
+            subset_id=_DATASETNAME,
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_sentiment_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "sentence": datasets.Value("string"),
+                    "sentiment": datasets.ClassLabel(names=self.SENTIMENT_LABEL_CLASSES),
+                    "topic": datasets.ClassLabel(names=self.TOPIC_LABEL_CLASSES),
+                }
+            )
+        elif self.config.name == f"{_DATASETNAME}_sentiment_seacrowd_{self.SEACROWD_SCHEMA_NAME}":
+            features = schemas.text_features(self.SENTIMENT_LABEL_CLASSES)
+        elif self.config.name == f"{_DATASETNAME}_topic_seacrowd_{self.SEACROWD_SCHEMA_NAME}":
+            features = schemas.text_features(self.TOPIC_LABEL_CLASSES)
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        data_dir = dl_manager.download(_URLS)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "sentences_path": data_dir["train"]["sentences"],
+                    "sentiments_path": data_dir["train"]["sentiments"],
+                    "topics_path": data_dir["train"]["topics"],
+                    "split": "train",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "sentences_path": data_dir["test"]["sentences"],
+                    "sentiments_path": data_dir["test"]["sentiments"],
+                    "topics_path": data_dir["test"]["topics"],
+                    "split": "test",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "sentences_path": data_dir["validation"]["sentences"],
+                    "sentiments_path": data_dir["validation"]["sentiments"],
+                    "topics_path": data_dir["validation"]["topics"],
+                    "split": "dev",
+                },
+            ),
+        ]
+
+    def _generate_examples(self, sentences_path: Path, sentiments_path: Path, topics_path: Path, split: str) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+
+        if self.config.schema == "source":
+            with open(sentences_path, encoding="utf-8") as sentences, open(sentiments_path, encoding="utf-8") as sentiments, open(topics_path, encoding="utf-8") as topics:
+                for key, (sentence, sentiment, topic) in enumerate(zip(sentences, sentiments, topics)):
+                    yield key, {
+                        "sentence": sentence.strip(),
+                        "sentiment":
int(sentiment.strip()), + "topic": int(topic.strip()), + } + + elif self.config.name == f"{_DATASETNAME}_sentiment_seacrowd_{self.SEACROWD_SCHEMA_NAME}": + with open(sentences_path, encoding="utf-8") as sentences, open(sentiments_path, encoding="utf-8") as sentiments: + for key, (sentence, sentiment) in enumerate(zip(sentences, sentiments)): + yield key, {"id": str(key), "text": sentence.strip(), "label": int(sentiment.strip())} + elif self.config.name == f"{_DATASETNAME}_topic_seacrowd_{self.SEACROWD_SCHEMA_NAME}": + with open(sentences_path, encoding="utf-8") as sentences, open(topics_path, encoding="utf-8") as topics: + for key, (sentence, topic) in enumerate(zip(sentences, topics)): + yield key, { + "id": str(key), + "text": sentence.strip(), + "label": int(topic.strip()), + } diff --git a/seacrowd/sea_datasets/uit_vsmec/__init__.py b/seacrowd/sea_datasets/uit_vsmec/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/uit_vsmec/uit_vsmec.py b/seacrowd/sea_datasets/uit_vsmec/uit_vsmec.py new file mode 100644 index 000000000..366250f69 --- /dev/null +++ b/seacrowd/sea_datasets/uit_vsmec/uit_vsmec.py @@ -0,0 +1,130 @@ +# coding=utf-8 +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{ho2020emotion, + title={Emotion recognition for vietnamese social media text}, + author={Ho, Vong Anh and Nguyen, Duong Huynh-Cong and Nguyen, Danh Hoang and Pham, Linh Thi-Van and Nguyen, Duc-Vu and Nguyen, Kiet Van and Nguyen, Ngan Luu-Thuy}, + booktitle={Computational Linguistics: 16th International Conference of the Pacific Association for Computational Linguistics, PACLING 2019, Hanoi, Vietnam, October 11--13, 2019, Revised Selected Papers 16}, + pages={319--333}, + year={2020}, + organization={Springer} +} +""" + +_DATASETNAME = "uit_vsmec" + +_DESCRIPTION = """\ +This dataset consists of Vietnamese Facebook comments that were manually annotated for sentiment. +There are seven possible emotion labels: enjoyment, sadness, fear, anger, disgust, surprise or other (for comments with no or neutral emotions). +Two rounds of manual annotations were done to train annotators with tagging and editing guidelines. +Annotation was performed until inter-annotator agreement reached at least 80%. +""" + +_HOMEPAGE = "https://drive.google.com/drive/folders/1HooABJyrddVGzll7fgkJ6VzkG_XuWfRu" + +_LICENSE = Licenses.UNKNOWN.value + +_LANGUAGES = ["vie"] + +_LOCAL = False + +_URLS = { + "train": "https://docs.google.com/spreadsheets/export?id=10VYzfK7JLg-vfmqH0UmKX62z_uaXU-Hp&format=csv", + "valid": "https://docs.google.com/spreadsheets/export?id=1EsSFZ94fj2yTvFKO6EyxM0wBRcG0s1KE&format=csv", + "test": "https://docs.google.com/spreadsheets/export?id=1D16FCKKgJ0T6t2aSA3biWVwvD9fa4G9a&format=csv", +} + +_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class UITVSMECDataset(datasets.GeneratorBasedBuilder): + """ + This is the main class of SEACrowd dataloader for UIT-VSMEC, focusing on emotion/sentiment classification task. 
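Before moving on to the UIT-VSMEC class body, a minimal usage sketch for the UIT-VSFC configurations defined above. This is only a sketch under assumptions: the seacrowd package is installed, the script is loaded from its repository-relative path, and the installed datasets release still supports local dataset scripts (newer releases may require trust_remote_code=True or drop script support entirely).

from datasets import load_dataset

# Repository-relative path of the loader added in this diff.
script = "seacrowd/sea_datasets/uit_vsfc/uit_vsfc.py"

# SEACrowd "text" schema: each example carries "id", "text", and a class "label".
sentiment = load_dataset(script, name="uit_vsfc_sentiment_seacrowd_text", split="train")
topic = load_dataset(script, name="uit_vsfc_topic_seacrowd_text", split="train")

print(sentiment[0])  # e.g. {"id": "0", "text": "...", "label": ...}
print(topic[0])      # label drawn from ["lecturer", "training_program", "others", "facility"]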
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_text", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_text", + subset_id=f"{_DATASETNAME}", + ), + ] + LABEL_NAMES = ["Other", "Disgust", "Enjoyment", "Anger", "Surprise", "Sadness", "Fear"] + DEFAULT_CONFIG_NAME = "uit_vsmec_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features({"Emotion": datasets.Value("string"), "Sentence": datasets.Value("string")}) + + elif self.config.schema == "seacrowd_text": + features = schemas.text_features(self.LABEL_NAMES) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + path_dict = dl_manager.download_and_extract(_URLS) + train_path, valid_path, test_path = path_dict["train"], path_dict["valid"], path_dict["test"] + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": train_path, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": test_path, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": valid_path, + }, + ), + ] + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + df = pd.read_csv(filepath).reset_index() + if self.config.schema == "source": + for row in df.itertuples(): + ex = {"Emotion": row.Emotion, "Sentence": row.Sentence} + yield row.index, ex + + elif self.config.schema == "seacrowd_text": + for row in df.itertuples(): + ex = {"id": str(row.index), "text": row.Sentence, "label": self.LABEL_NAMES.index(row.Emotion)} + yield row.index, ex diff --git a/seacrowd/sea_datasets/unimorph/__init__.py b/seacrowd/sea_datasets/unimorph/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/unimorph/unimorph.py b/seacrowd/sea_datasets/unimorph/unimorph.py new file mode 100644 index 000000000..f764265ba --- /dev/null +++ b/seacrowd/sea_datasets/unimorph/unimorph.py @@ -0,0 +1,447 @@ +from pathlib import Path +from typing import Any, Dict, List, Tuple + +import datasets +from datasets.download.download_manager import DownloadManager + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@misc{batsuren2022unimorph, + title={UniMorph 4.0: Universal Morphology}, + author={ + Khuyagbaatar Batsuren and Omer Goldman and Salam Khalifa and Nizar + Habash and Witold Kieraś and Gábor Bella and Brian Leonard and Garrett + Nicolai and Kyle Gorman and Yustinus Ghanggo Ate and Maria Ryskina and + Sabrina J. 
Mielke and Elena Budianskaya and Charbel El-Khaissi and Tiago + Pimentel and Michael Gasser and William Lane and Mohit Raj and Matt + Coler and Jaime Rafael Montoya Samame and Delio Siticonatzi Camaiteri + and Benoît Sagot and Esaú Zumaeta Rojas and Didier López Francis and + Arturo Oncevay and Juan López Bautista and Gema Celeste Silva Villegas + and Lucas Torroba Hennigen and Adam Ek and David Guriel and Peter Dirix + and Jean-Philippe Bernardy and Andrey Scherbakov and Aziyana Bayyr-ool + and Antonios Anastasopoulos and Roberto Zariquiey and Karina Sheifer and + Sofya Ganieva and Hilaria Cruz and Ritván Karahóǧa and Stella + Markantonatou and George Pavlidis and Matvey Plugaryov and Elena + Klyachko and Ali Salehi and Candy Angulo and Jatayu Baxi and Andrew + Krizhanovsky and Natalia Krizhanovskaya and Elizabeth Salesky and Clara + Vania and Sardana Ivanova and Jennifer White and Rowan Hall Maudslay and + Josef Valvoda and Ran Zmigrod and Paula Czarnowska and Irene Nikkarinen + and Aelita Salchak and Brijesh Bhatt and Christopher Straughn and Zoey + Liu and Jonathan North Washington and Yuval Pinter and Duygu Ataman and + Marcin Wolinski and Totok Suhardijanto and Anna Yablonskaya and Niklas + Stoehr and Hossep Dolatian and Zahroh Nuriah and Shyam Ratan and Francis + M. Tyers and Edoardo M. Ponti and Grant Aiton and Aryaman Arora and + Richard J. Hatcher and Ritesh Kumar and Jeremiah Young and Daria + Rodionova and Anastasia Yemelina and Taras Andrushko and Igor Marchenko + and Polina Mashkovtseva and Alexandra Serova and Emily Prud'hommeaux and + Maria Nepomniashchaya and Fausto Giunchiglia and Eleanor Chodroff and + Mans Hulden and Miikka Silfverberg and Arya D. McCarthy and David + Yarowsky and Ryan Cotterell and Reut Tsarfaty and Ekaterina Vylomova}, + year={2022}, + eprint={2205.03608}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" + +_LOCAL = False +_LANGUAGES = ["ind", "kod", "ceb", "hil", "tgl"] +_DATASETNAME = "unimorph" +_DESCRIPTION = """\ +The Universal Morphology (UniMorph) project is a collaborative effort providing +broad-coverage instantiated normalized morphological inflection tables for +hundreds of diverse world languages. The project comprises two major thrusts: a +language-independent feature schema for rich morphological annotation, and a +type-level resource of annotated data in diverse languages realizing that +schema. 5 Austronesian languages spoken in Southeast Asia, consisting of 2 +Malayo-Polynesian languages and 3 Greater Central Philippine languages, became +part of the UniMorph 4.0 release.
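The loader below reads the raw per-language UniMorph files, which are tab-separated triples of lemma, inflected form, and a semicolon-delimited feature bundle. A small sketch of that parsing step follows; the entry is invented for illustration and is not taken from the corpus.

# Invented UniMorph-style entry: lemma, inflected form, feature bundle.
line = "makan\tmemakan\tV;ACT"

lemma, word, tags = line.strip().split("\t")     # tab-separated triple
feats = tags.replace("NDEF", "INDF").split(";")  # same normalisation used by the loader below

print(lemma, word, feats)  # makan memakan ['V', 'ACT']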
+""" + +_HOMEPAGE = "https://unimorph.github.io" +_LICENSE = Licenses.CC_BY_SA_3_0.value +_URL = "https://raw.githubusercontent.com/unimorph/" + +_SUPPORTED_TASKS = [Tasks.MORPHOLOGICAL_INFLECTION] +_SOURCE_VERSION = "4.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class UnimorphDataset(datasets.GeneratorBasedBuilder): + """Unimorh 4.0 dataset by Batsuren et al., (2022)""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "pairs_multi" + + dataset_names = sorted([f"{_DATASETNAME}_{lang}" for lang in _LANGUAGES]) + BUILDER_CONFIGS = [] + for name in dataset_names: + source_config = SEACrowdConfig( + name=f"{name}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=name, + ) + BUILDER_CONFIGS.append(source_config) + seacrowd_config = SEACrowdConfig( + name=f"{name}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=name, + ) + BUILDER_CONFIGS.append(seacrowd_config) + + # Add configuration that allows loading all datasets at once. + BUILDER_CONFIGS.extend( + [ + # unimorph_source + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema (all)", + schema="source", + subset_id=_DATASETNAME, + ), + # unimorph_seacrowd_pairs + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema (all)", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + ) + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + # https://huggingface.co/datasets/universal_morphologies/blob/main/universal_morphologies.py + CLASS_CATEGORIES = { + "Aktionsart": ["STAT", "DYN", "TEL", "ATEL", "PCT", "DUR", "ACH", "ACCMP", "SEMEL", "ACTY"], + "Animacy": ["ANIM", "INAN", "HUM", "NHUM"], + "Argument_Marking": [ + "ARGNO1S", + "ARGNO2S", + "ARGNO3S", + "ARGNO1P", + "ARGNO2P", + "ARGNO3P", + "ARGAC1S", + "ARGAC2S", + "ARGAC3S", + "ARGAC1P", + "ARGAC2P", + "ARGAC3P", + "ARGAB1S", + "ARGAB2S", + "ARGAB3S", + "ARGAB1P", + "ARGAB2P", + "ARGAB3P", + "ARGER1S", + "ARGER2S", + "ARGER3S", + "ARGER1P", + "ARGER2P", + "ARGER3P", + "ARGDA1S", + "ARGDA2S", + "ARGDA3S", + "ARGDA1P", + "ARGDA2P", + "ARGDA3P", + "ARGBE1S", + "ARGBE2S", + "ARGBE3S", + "ARGBE1P", + "ARGBE2P", + "ARGBE3P", + ], + "Aspect": ["IPFV", "PFV", "PRF", "PROG", "PROSP", "ITER", "HAB"], + "Case": [ + "NOM", + "ACC", + "ERG", + "ABS", + "NOMS", + "DAT", + "BEN", + "PRP", + "GEN", + "REL", + "PRT", + "INS", + "COM", + "VOC", + "COMPV", + "EQTV", + "PRIV", + "PROPR", + "AVR", + "FRML", + "TRANS", + "BYWAY", + "INTER", + "AT", + "POST", + "IN", + "CIRC", + "ANTE", + "APUD", + "ON", + "ONHR", + "ONVR", + "SUB", + "REM", + "PROXM", + "ESS", + "ALL", + "ABL", + "APPRX", + "TERM", + ], + "Comparison": ["CMPR", "SPRL", "AB", "RL", "EQT"], + "Definiteness": ["DEF", "INDF", "SPEC", "NSPEC"], + "Deixis": ["PROX", "MED", "REMT", "REF1", "REF2", "NOREF", "PHOR", "VIS", "NVIS", "ABV", "EVEN", "BEL"], + "Evidentiality": ["FH", "DRCT", "SEN", "VISU", "NVSEN", "AUD", "NFH", "QUOT", "RPRT", "HRSY", "INFER", "ASSUM"], + "Finiteness": ["FIN", "NFIN"], + "Gender": [ + "MASC", + "FEM", + "NEUT", + "NAKH1", + "NAKH2", + "NAKH3", + "NAKH4", + "NAKH5", + "NAKH6", + "NAKH7", + "NAKH8", + "BANTU1", + "BANTU2", + "BANTU3", + "BANTU4", + "BANTU5", + "BANTU6", + "BANTU7", + 
"BANTU8", + "BANTU9", + "BANTU10", + "BANTU11", + "BANTU12", + "BANTU13", + "BANTU14", + "BANTU15", + "BANTU16", + "BANTU17", + "BANTU18", + "BANTU19", + "BANTU20", + "BANTU21", + "BANTU22", + "BANTU23", + ], + "Information_Structure": ["TOP", "FOC"], + "Interrogativity": ["DECL", "INT"], + "Language_Specific": [ + "LGSPEC1", + "LGSPEC2", + "LGSPEC3", + "LGSPEC4", + "LGSPEC5", + "LGSPEC6", + "LGSPEC7", + "LGSPEC8", + "LGSPEC9", + "LGSPEC10", + ], + "Mood": [ + "IND", + "SBJV", + "REAL", + "IRR", + "AUPRP", + "AUNPRP", + "IMP", + "COND", + "PURP", + "INTEN", + "POT", + "LKLY", + "ADM", + "OBLIG", + "DEB", + "PERM", + "DED", + "SIM", + "OPT", + ], + "Number": ["SG", "PL", "GRPL", "DU", "TRI", "PAUC", "GRPAUC", "INVN"], + "Part_Of_Speech": [ + "N", + "PROPN", + "ADJ", + "PRO", + "CLF", + "ART", + "DET", + "V", + "ADV", + "AUX", + "V.PTCP", + "V.MSDR", + "V.CVB", + "ADP", + "COMP", + "CONJ", + "NUM", + "PART", + "INTJ", + ], + "Person": ["0", "1", "2", "3", "4", "INCL", "EXCL", "PRX", "OBV"], + "Polarity": ["POS", "NEG"], + "Politeness": [ + "INFM", + "FORM", + "ELEV", + "HUMB", + "POL", + "AVOID", + "LOW", + "HIGH", + "STELEV", + "STSUPR", + "LIT", + "FOREG", + "COL", + ], + "Possession": [ + "ALN", + "NALN", + "PSS1S", + "PSS2S", + "PSS2SF", + "PSS2SM", + "PSS2SINFM", + "PSS2SFORM", + "PSS3S", + "PSS3SF", + "PSS3SM", + "PSS1D", + "PSS1DI", + "PSS1DE", + "PSS2D", + "PSS2DM", + "PSS2DF", + "PSS3D", + "PSS3DF", + "PSS3DM", + "PSS1P", + "PSS1PI", + "PSS1PE", + "PSS2P", + "PSS2PF", + "PSS2PM", + "PSS3PF", + "PSS3PM", + ], + "Switch_Reference": ["SS", "SSADV", "DS", "DSADV", "OR", "SIMMA", "SEQMA", "LOG"], + "Tense": ["PRS", "PST", "FUT", "IMMED", "HOD", "1DAY", "RCT", "RMT"], + "Valency": ["IMPRS", "INTR", "TR", "DITR", "REFL", "RECP", "CAUS", "APPL"], + "Voice": ["ACT", "MID", "PASS", "ANTIP", "DIR", "INV", "AGFOC", "PFOC", "LFOC", "BFOC", "ACFOC", "IFOC", "CFOC"], + } + + TAG_TO_CAT = dict([(tag, cat) for cat, tags in CLASS_CATEGORIES.items() for tag in tags]) + CLASS_LABELS = [feat for _, category in CLASS_CATEGORIES.items() for feat in category] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "lemma": datasets.Value("string"), + "forms": datasets.Sequence( + dict( + [("word", datasets.Value("string"))] + + [(cat, datasets.Sequence(datasets.ClassLabel(names=tasks))) for cat, tasks in self.CLASS_CATEGORIES.items()] + + [("Other", datasets.Sequence(datasets.Value("string")))] # for misspecified tags + ) + ), + } + ) + + if self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + all_features = [feat for _, category in self.CLASS_CATEGORIES.items() for feat in category] + features = schemas.pairs_multi_features(label_names=self.CLASS_LABELS) + + return datasets.DatasetInfo(description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION) + + def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]: + """Return SplitGenerators.""" + source_data = [] + + lang = self.config.name.split("_")[1] + if lang in _LANGUAGES: + # Load data per language + source_data.append(dl_manager.download_and_extract(_URL + f"{lang}/main/{lang}")) + else: + # Load examples from all languages at once. 
+ for lang in _LANGUAGES: + source_data.append(dl_manager.download_and_extract(_URL + f"{lang}/main/{lang}")) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepaths": source_data, + }, + ) + ] + + def _generate_examples(self, filepaths: List[Path]) -> Tuple[int, Dict]: + """Yield examples as (key, example) tuples""" + + all_forms: Dict[str, List[Dict[str, Any]]] = {} + for source_file in filepaths: + with open(source_file, encoding="utf-8") as file: + for row in file: + if row.strip() == "" or row.strip().startswith("#"): + continue + lemma, word, tags = row.strip().split("\t") + all_forms[lemma] = all_forms.get(lemma, []) + tag_list = tags.replace("NDEF", "INDF").split(";") + form = dict([("word", word), ("Other", [])] + [(cat, []) for cat, tasks in self.CLASS_CATEGORIES.items()]) + for tag_pre in tag_list: + tag = tag_pre.split("+") + if tag[0] in self.TAG_TO_CAT: + form[self.TAG_TO_CAT[tag[0]]] = tag + else: + form["Other"] += tag + all_forms[lemma] += [form] + + if self.config.schema == "source": + for id_, (lemma, forms) in enumerate(all_forms.items()): + res = {"lemma": lemma, "forms": {}} + for k in ["word", "Other"] + list(self.CLASS_CATEGORIES.keys()): + res["forms"][k] = [form[k] for form in forms] + yield id_, res + + if self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + idx = 0 + for lemma, forms in all_forms.items(): + for form in forms: + inflection = form.pop("word") + feats = [feat[0] for feat in list(form.values()) if feat and feat[0] in self.CLASS_LABELS] + example = { + "id": idx, + "text_1": lemma, + "text_2": inflection, + "label": feats, + } + idx += 1 + yield idx, example diff --git a/seacrowd/sea_datasets/vi_pubmed/__init__.py b/seacrowd/sea_datasets/vi_pubmed/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/vi_pubmed/vi_pubmed.py b/seacrowd/sea_datasets/vi_pubmed/vi_pubmed.py new file mode 100644 index 000000000..cdb2f3477 --- /dev/null +++ b/seacrowd/sea_datasets/vi_pubmed/vi_pubmed.py @@ -0,0 +1,260 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks + +_CITATION = """\ +@misc{mtet, + doi = {10.48550/ARXIV.2210.05610}, + url = {https://arxiv.org/abs/2210.05610}, + author = {Ngo, Chinh and Trinh, Trieu H. 
and Phan, Long and Tran, Hieu and Dang, Tai and Nguyen, Hieu and Nguyen, Minh and Luong, Minh-Thang}, + keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {MTet: Multi-domain Translation for English and Vietnamese}, + publisher = {arXiv}, + year = {2022}, + copyright = {Creative Commons Attribution 4.0 International} +} +""" + +_DATASETNAME = "vi_pubmed" + +_DESCRIPTION = """\ +20M Vietnamese PubMed biomedical abstracts translated by the state-of-the-art English-Vietnamese Translation project. The data has been used as unlabeled dataset for pretraining a Vietnamese Biomedical-domain Transformer model. +""" + +_HOMEPAGE = "https://huggingface.co/datasets/VietAI/vi_pubmed" + +_LANGUAGES = ["eng", "vie"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.OTHERS.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: { + # 89 parquet shards exported on the Hugging Face Hub: 0.parquet ... 88.parquet + "pubmed22": [f"https://huggingface.co/api/datasets/VietAI/vi_pubmed/parquet/default/pubmed22/{i}.parquet" for i in range(89)], + }, +} + +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] +_SUPPORTED_SCHEMA_STRINGS = [f"seacrowd_{str(TASK_TO_SCHEMA[task]).lower()}" for task in _SUPPORTED_TASKS] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class ViPubmed(datasets.GeneratorBasedBuilder): + """20M Vietnamese PubMed biomedical abstracts translated by the state-of-the-art English-Vietnamese Translation project.
The data has been used as unlabeled dataset for pretraining a Vietnamese Biomedical-domain Transformer model.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + ] + + seacrowd_schema_config: list[SEACrowdConfig] = [] + + for seacrowd_schema in _SUPPORTED_SCHEMA_STRINGS: + + seacrowd_schema_config.append( + SEACrowdConfig( + name=f"{_DATASETNAME}_{seacrowd_schema}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} {seacrowd_schema} schema", + schema=f"{seacrowd_schema}", + subset_id=f"{_DATASETNAME}", + ) + ) + + BUILDER_CONFIGS.extend(seacrowd_schema_config) + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "en": datasets.Value("string"), + "vi": datasets.Value("string"), + } + ) + + elif self.config.schema == f"seacrowd_{str(TASK_TO_SCHEMA[Tasks.MACHINE_TRANSLATION]).lower()}": + features = schemas.text2text_features + + else: + raise ValueError(f"Invalid config: {self.config.name}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + split_name = "pubmed22" + paths = dl_manager.download_and_extract(_URLS[_DATASETNAME][split_name]) + + return [ + datasets.SplitGenerator( + name=split_name, + gen_kwargs={ + "paths": paths, + "split": split_name, + }, + ), + ] + + def _generate_examples(self, paths: list[Path], split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + idx = 0 + + if self.config.schema == "source": + + for path in paths: + df = pd.read_parquet(path) + + for _, row in df.iterrows(): + yield idx, row.to_dict() + idx += 1 + + elif self.config.schema == f"seacrowd_{str(TASK_TO_SCHEMA[Tasks.MACHINE_TRANSLATION]).lower()}": + for path in paths: + df = pd.read_parquet(path) + + df["id"] = df.index + idx + df.rename(columns={"en": "text_1"}, inplace=True) + df.rename(columns={"vi": "text_2"}, inplace=True) + df = df.assign(text_1_name="en").astype({"text_1_name": "str"}) + df = df.assign(text_2_name="vi").astype({"text_2_name": "str"}) + + for _, row in df.iterrows(): + yield idx, row.to_dict() + idx += 1 + + else: + raise ValueError(f"Invalid config: {self.config.name}") diff --git a/seacrowd/sea_datasets/vihealthqa/__init__.py b/seacrowd/sea_datasets/vihealthqa/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/vihealthqa/vihealthqa.py b/seacrowd/sea_datasets/vihealthqa/vihealthqa.py new file mode 100644 index 000000000..baa3188e5 --- /dev/null +++ b/seacrowd/sea_datasets/vihealthqa/vihealthqa.py @@ -0,0 +1,157 @@ +# coding=utf-8 +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@InProceedings{nguyen2022viheathqa, + author="Nguyen, Nhung Thi-Hong + and Ha, Phuong Phan-Dieu + and Nguyen, Luan Thanh + and Van Nguyen, Kiet + and Nguyen, Ngan Luu-Thuy", + 
title="SPBERTQA: A Two-Stage Question Answering System Based on Sentence Transformers for Medical Texts", + booktitle="Knowledge Science, Engineering and Management", + year="2022", + publisher="Springer International Publishing", + address="Cham", + pages="371--382", + isbn="978-3-031-10986-7" +} +""" +_DATASETNAME = "vihealthqa" +_DESCRIPTION = """\ +Vietnamese Visual Question Answering (ViVQA) consist of 10328 images and 15000 question-answer +pairs in Vietnamese for evaluating Vietnamese VQA models. This dataset is built based on 10328 randomly +selected images from MS COCO dataset. The question-answer pairs were based on the COCO-QA dataset that +was automatically translated from English to Vietnamese. +""" +_HOMEPAGE = "https://huggingface.co/datasets/tarudesu/ViHealthQA" +_LANGUAGES = ["vie"] +_LICENSE = Licenses.UNKNOWN.value +_PAPER_URL = "https://link.springer.com/chapter/10.1007/978-3-031-10986-7_30" +_LOCAL = False +_URLS = { + "vihealthqa": { + "train": "https://huggingface.co/datasets/tarudesu/ViHealthQA/raw/main/train.csv", + "val": "https://huggingface.co/datasets/tarudesu/ViHealthQA/raw/main/val.csv", + "test": "https://huggingface.co/datasets/tarudesu/ViHealthQA/raw/main/test.csv", + } +} +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class ViHealthQADataset(datasets.GeneratorBasedBuilder): + ''' +This is a SeaCrowed dataloader for dataset Vietnamese Visual Question Answering (ViVQA), which consists of 10328 images and 15000 question-answer +pairs in Vietnamese for evaluating Vietnamese VQA models. + ''' + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_qa", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_qa", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "question": datasets.Value("string"), + "answer": datasets.Value("string"), + "link": datasets.Value("string") + } + ) + elif self.config.schema == "seacrowd_qa": + features = schemas.qa_features + features["meta"] = {"link": datasets.Value("string")} + else: + raise ValueError(f"No schema matched for {self.config.schema}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS["vihealthqa"] + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir["train"], + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_dir["val"], + "split": "val", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_dir["test"], + "split": "test", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) 
tuples.""" + + raw_examples = pd.read_csv(filepath) + + for eid, exam in raw_examples.iterrows(): + assert len(exam) == 4 + exam_id, exam_quest, exam_answer, exam_link = exam + + if self.config.schema == "source": + yield eid, {"id": str(exam_id), "question": exam_quest, "answer": exam_answer, "link": exam_link} + + elif self.config.schema == "seacrowd_qa": + yield eid, { + "id": str(eid), + "question_id": exam_id, + "document_id": str(eid), + "question": exam_quest, + "type": None, + "choices": [], + "context": exam_link, + "answer": [exam_answer], + "meta": { + "link": exam_link, + }, + } diff --git a/seacrowd/sea_datasets/visobert/__init__.py b/seacrowd/sea_datasets/visobert/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/visobert/visobert.py b/seacrowd/sea_datasets/visobert/visobert.py new file mode 100644 index 000000000..ac7dd7bde --- /dev/null +++ b/seacrowd/sea_datasets/visobert/visobert.py @@ -0,0 +1,158 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{nguyen-etal-2023-visobert, + title = "{V}i{S}o{BERT}: A Pre-Trained Language Model for {V}ietnamese Social Media Text Processing", + author = "Nguyen, Nam and + Phan, Thang and + Nguyen, Duc-Vu and + Nguyen, Kiet", + editor = "Bouamor, Houda and + Pino, Juan and + Bali, Kalika", + booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing", + month = dec, + year = "2023", + address = "Singapore", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2023.emnlp-main.315", + pages = "5191--5207", + abstract = "English and Chinese, known as resource-rich languages, have witnessed the strong + development of transformer-based language models for natural language processing tasks. Although + Vietnam has approximately 100M people speaking Vietnamese, several pre-trained models, e.g., PhoBERT, + ViBERT, and vELECTRA, performed well on general Vietnamese NLP tasks, including POS tagging and + named entity recognition. These pre-trained language models are still limited to Vietnamese social + media tasks. In this paper, we present the first monolingual pre-trained language model for + Vietnamese social media texts, ViSoBERT, which is pre-trained on a large-scale corpus of high-quality + and diverse Vietnamese social media texts using XLM-R architecture. Moreover, we explored our + pre-trained model on five important natural language downstream tasks on Vietnamese social media + texts: emotion recognition, hate speech detection, sentiment analysis, spam reviews detection, and + hate speech spans detection. 
Our experiments demonstrate that ViSoBERT, with far fewer parameters, + surpasses the previous state-of-the-art models on multiple Vietnamese social media tasks. Our + ViSoBERT model is available only for research purposes. Disclaimer: This paper contains actual + comments on social networks that might be construed as abusive, offensive, or obscene.", +} +""" + +_DATASETNAME = "visobert" + +_DESCRIPTION = """\ +The ViSoBERT corpus is composed of Vietnamese textual data crawled from Facebook, TikTok, and YouTube. The +dataset contains Facebook posts, TikTok comments, and Youtube comments of Vietnamese-verified users, from +Jan 2016 (Jan 2020 for TikTok) to Dec 2022. A post-processing mechanism is applied to handles hashtags, +emojis, misspellings, hyperlinks, and other noncanonical texts. +""" + +_HOMEPAGE = "https://huggingface.co/uitnlp/visobert" + +_LANGUAGES = ["vie"] + +_LICENSE = Licenses.CC_BY_NC_4_0.value + +_LOCAL = False + +_URLS = "https://drive.usercontent.google.com/download?id=1BoiR9k2DrjBcd2aHy5BOq4haEp5V2_ug&confirm=xxx" + +_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class ViSoBERTDataset(datasets.GeneratorBasedBuilder): + """ + The ViSoBERT corpus is a Vietnamese pretraining dataset from https://huggingface.co/uitnlp/visobert. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_ssp", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_ssp", + subset_id=f"{_DATASETNAME}", + ), + ] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source" or self.config.schema == "seacrowd_ssp": + features = schemas.self_supervised_pretraining.features + else: + raise ValueError(f"Invalid schema: '{self.config.schema}'") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """ + Returns SplitGenerators. + """ + + path = dl_manager.download(_URLS) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": path, + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """ + Yields examples as (key, example) tuples. 
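For the ViSoBERT corpus defined above, a minimal sketch of pulling its self-supervised pretraining view; it assumes the Google Drive mirror stays reachable and a datasets release that still loads local dataset scripts.

from datasets import load_dataset

corpus = load_dataset(
    "seacrowd/sea_datasets/visobert/visobert.py",
    name="visobert_seacrowd_ssp",
    split="train",
)
print(corpus[0])  # {"id": "0", "text": "<one crawled social-media line>"}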
+ """ + + with open(filepath, "r", encoding="utf-8") as f: + if self.config.schema == "source" or self.config.schema == "seacrowd_ssp": + for idx, row in enumerate(f): + if row.strip() != "": + yield ( + idx, + { + "id": str(idx), + "text": row.strip(), + }, + ) + else: + raise ValueError(f"Invalid config: '{self.config.name}'") \ No newline at end of file diff --git a/seacrowd/sea_datasets/vispamreviews/__init__.py b/seacrowd/sea_datasets/vispamreviews/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/vispamreviews/vispamreviews.py b/seacrowd/sea_datasets/vispamreviews/vispamreviews.py new file mode 100644 index 000000000..4bbd1871d --- /dev/null +++ b/seacrowd/sea_datasets/vispamreviews/vispamreviews.py @@ -0,0 +1,179 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Dict, List, Tuple + +import datasets +import pandas + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """ +@InProceedings{10.1007/978-3-031-21743-2_48, +author="Van Dinh, Co +and Luu, Son T. +and Nguyen, Anh Gia-Tuan", +editor="Nguyen, Ngoc Thanh +and Tran, Tien Khoa +and Tukayev, Ualsher +and Hong, Tzung-Pei +and Trawi{\'{n}}ski, Bogdan +and Szczerbicki, Edward", +title="Detecting Spam Reviews on Vietnamese E-Commerce Websites", +booktitle="Intelligent Information and Database Systems", +year="2022", +publisher="Springer International Publishing", +address="Cham", +pages="595--607", +abstract="The reviews of customers play an essential role in online shopping. +People often refer to reviews or comments of previous customers to decide whether +to buy a new product. Catching up with this behavior, some people create untruths and +illegitimate reviews to hoax customers about the fake quality of products. These are called +spam reviews, confusing consumers on online shopping platforms and negatively affecting online +shopping behaviors. We propose the dataset called ViSpamReviews, which has a strict annotation +procedure for detecting spam reviews on e-commerce platforms. Our dataset consists of two tasks: +the binary classification task for detecting whether a review is spam or not and the multi-class +classification task for identifying the type of spam. The PhoBERT obtained the highest results on +both tasks, 86.89%, and 72.17%, respectively, by macro average F1 score.", +isbn="978-3-031-21743-2" +} +""" + +_LOCAL = False +_LANGUAGES = ["vie"] +_DATASETNAME = "vispamreviews" +_DESCRIPTION = """ +The dataset was collected from leading online shopping platforms in Vietnam. Some of the most recent +selling products for each product category were selected and up to 15 reviews per product were collected. +Each review was then labeled as either NO-SPAM, SPAM-1 (fake review), SPAM-2 (review on brand only), or +SPAM-3 (irrelevant content). 
+""" + +_HOMEPAGE = "https://github.com/sonlam1102/vispamdetection/" +_LICENSE = Licenses.CC_BY_NC_4_0.value +_URL = "https://raw.githubusercontent.com/sonlam1102/vispamdetection/main/dataset/vispamdetection_dataset.zip" + +_Split_Path = { + "train": "dataset/train.csv", + "validation": "dataset/dev.csv", + "test": "dataset/test.csv", +} + +_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS] # Text Classification +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class ViSpamReviewsDataset(datasets.GeneratorBasedBuilder): + """ + The SeaCrowd dataloader for the review dataset shopping platforms in Vietnam (ViSpamReviews). + """ + + CLASS_LABELS = [0, 1] + SPAM_TYPE_LABELS = [0, 1, 2, 3] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_spam_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_text", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema ", + schema="seacrowd_text", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_spam_seacrowd_text", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema ", + schema="seacrowd_text", + subset_id=f"{_DATASETNAME}_spam", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.name.endswith("source"): + features = (datasets.Features + ( + {"id": datasets.Value("int32"), + "text": datasets.Value("string"), + "label": datasets.Value("string"), + "spam_label": datasets.Value("string"), + "rating": datasets.Value("int32") + } + )) + + elif self.config.name == "vispamreviews_seacrowd_text": + features = schemas.text_features(label_names=self.CLASS_LABELS) + elif self.config.name == "vispamreviews_spam_seacrowd_text": + features = schemas.text_features(label_names=self.SPAM_TYPE_LABELS) + else: + raise ValueError(f"Invalid schema {self.config.name}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + file_paths = dl_manager.download_and_extract(_URL) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": os.path.join(file_paths, _Split_Path["train"])}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": os.path.join(file_paths, _Split_Path["validation"])}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": os.path.join(file_paths, _Split_Path["test"])}, + ), + ] + + def _generate_examples(self, filepath) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + data_lines = pandas.read_csv(filepath) + for rid, row in enumerate(data_lines.itertuples()): + if self.config.name.endswith("source"): + example = {"id": str(rid), "text": row.Comment, "label": row.Label, "spam_label": row.SpamLabel, + "rating": row.Rating} + elif self.config.name == "vispamreviews_seacrowd_text": + example = {"id": str(rid), "text": row.Comment, "label": row.Label} + elif self.config.name == 
"vispamreviews_spam_seacrowd_text": + example = {"id": str(rid), "text": row.Comment, "label": row.SpamLabel} + else: + raise ValueError(f"Invalid schema {self.config.schema}") + yield rid, example diff --git a/seacrowd/sea_datasets/vistec_tp_th_21/__init__.py b/seacrowd/sea_datasets/vistec_tp_th_21/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/vistec_tp_th_21/vistec_tp_th_21.py b/seacrowd/sea_datasets/vistec_tp_th_21/vistec_tp_th_21.py new file mode 100644 index 000000000..e1b213474 --- /dev/null +++ b/seacrowd/sea_datasets/vistec_tp_th_21/vistec_tp_th_21.py @@ -0,0 +1,183 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import re +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{limkonchotiwat-etal-2021-handling, + title = "Handling Cross- and Out-of-Domain Samples in {T}hai Word Segmentation", + author = "Limkonchotiwat, Peerat and + Phatthiyaphaibun, Wannaphong and + Sarwar, Raheem and + Chuangsuwanich, Ekapol and + Nutanong, Sarana", + booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021", + month = aug, + year = "2021", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2021.findings-acl.86", + doi = "10.18653/v1/2021.findings-acl.86", + pages = "1003--1016", +} +""" + +_DATASETNAME = "vistec_tp_th_21" + +_DESCRIPTION = """\ +The largest social media domain datasets for Thai text processing (word segmentation, +misspell correction and detection, and named-entity boundary) called "VISTEC-TP-TH-2021" or VISTEC-2021. +VISTEC corpus contains 49,997 sentences with 3.39M words where the collection was manually annotated by +linguists on four tasks, namely word segmentation, misspelling detection and correction, +and named entity recognition. +""" + +_HOMEPAGE = "https://github.com/mrpeerat/OSKut/tree/main/VISTEC-TP-TH-2021" + + +_LANGUAGES = ["tha"] + + +_LICENSE = Licenses.CC_BY_SA_3_0.value + +_LOCAL = False + +_URLS = { + "train": "https://raw.githubusercontent.com/mrpeerat/OSKut/main/VISTEC-TP-TH-2021/train/VISTEC-TP-TH-2021_train_proprocessed.txt", + "test": "https://raw.githubusercontent.com/mrpeerat/OSKut/main/VISTEC-TP-TH-2021/test/VISTEC-TP-TH-2021_test_proprocessed.txt", +} + +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class VISTEC21Dataset(datasets.GeneratorBasedBuilder): + """ + The largest social media domain datasets for Thai text processing (word segmentation, + misspell correction and detection, and named-entity boundary) called "VISTEC-TP-TH-2021" or VISTEC-2021. 
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = "seq_label" + LABEL_CLASSES = ["0", "1"] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "ner_tags": datasets.Sequence(datasets.features.ClassLabel(names=self.LABEL_CLASSES)), + } + ) + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.seq_label_features(self.LABEL_CLASSES) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + data_files = { + "train": Path(dl_manager.download_and_extract(_URLS["train"])), + "test": Path(dl_manager.download_and_extract(_URLS["test"])), + } + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": data_files["train"], "split": "train"}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": data_files["test"], "split": "test"}, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + label_key = "ner_tags" if self.config.schema == "source" else "labels" + + with open(filepath, "r", encoding="utf-8") as f: + lines = f.readlines() + id = 0 + for line in lines: + tokens = line.split("|") + token_list = [] + ner_tag = [] + for token in tokens: + if "" in token: + token = token.replace("", "") + token = token.replace("", "") + token_list.append(token) + ner_tag.append(1) + continue + if "" in token and "]*)>", token)[0]) + ner_tag.append(0) + continue + if "" in token or "" in token: + token = token.replace("", "") + token = token.replace("", "") + token_list.append(token) + ner_tag.append(0) + continue + token_list.append(token) + ner_tag.append(0) + id += 1 + yield id, { + "id": str(id), + "tokens": token_list, + label_key: ner_tag, + } diff --git a/seacrowd/sea_datasets/vitext2sql/vitext2sql.py b/seacrowd/sea_datasets/vitext2sql/vitext2sql.py index ad8be6de4..a9b3cd14f 100644 --- a/seacrowd/sea_datasets/vitext2sql/vitext2sql.py +++ b/seacrowd/sea_datasets/vitext2sql/vitext2sql.py @@ -57,6 +57,8 @@ }, } +_LOCAL = False +_LANGUAGES = ["vie"] _SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] _SEACROWD_VERSION = "1.0.0" diff --git a/seacrowd/sea_datasets/vivos/__init__.py b/seacrowd/sea_datasets/vivos/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/vivos/vivos.py b/seacrowd/sea_datasets/vivos/vivos.py new file mode 100644 index 000000000..352dca2b1 --- /dev/null +++ b/seacrowd/sea_datasets/vivos/vivos.py @@ -0,0 +1,204 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script 
contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{luong-vu-2016-non, + title = "A non-expert {K}aldi recipe for {V}ietnamese Speech Recognition System", + author = "Luong, Hieu-Thi and + Vu, Hai-Quan", + editor = "Murakami, Yohei and + Lin, Donghui and + Ide, Nancy and + Pustejovsky, James", + booktitle = "Proceedings of the Third International Workshop on Worldwide Language Service + Infrastructure and Second Workshop on Open Infrastructures and Analysis Frameworks for + Human Language Technologies ({WLSI}/{OIAF}4{HLT}2016)", + month = dec, + year = "2016", + address = "Osaka, Japan", + publisher = "The COLING 2016 Organizing Committee", + url = "https://aclanthology.org/W16-5207", + pages = "51--55", + abstract = "In this paper we describe a non-expert setup for Vietnamese speech recognition + system using Kaldi toolkit. We collected a speech corpus over fifteen hours from about fifty + Vietnamese native speakers and using it to test the feasibility of our setup. The essential + linguistic components for the Automatic Speech Recognition (ASR) system was prepared basing + on the written form of the language instead of expertise knowledge on linguistic and phonology + as commonly seen in rich resource languages like English. The modeling of tones by integrating + them into the phoneme and using the phonetic decision tree is also discussed. Experimental + results showed this setup for ASR systems does yield competitive results while still have + potentials for further improvements.", +} +""" + +_DATASETNAME = "vivos" + +_DESCRIPTION = """\ +VIVOS is a Vietnamese speech corpus consisting of 15 hours of recording speech prepared for +Automatic Speech Recognition task. This speech corpus is collected by recording speech data +from more than 50 native Vietnamese volunteers. +""" + +_HOMEPAGE = "https://zenodo.org/records/7068130" + +_LANGUAGES = ["vie"] + +_LICENSE = Licenses.CC_BY_SA_4_0.value + +_LOCAL = False + +_URLS = { + "audio": "https://huggingface.co/datasets/vivos/resolve/main/data/vivos.tar.gz", + "train_prompt": "https://huggingface.co/datasets/vivos/resolve/main/data/prompts-train.txt.gz", + "test_prompt": "https://huggingface.co/datasets/vivos/resolve/main/data/prompts-test.txt.gz", +} + +_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + +logger = datasets.logging.get_logger(__name__) + + +class VIVOSDataset(datasets.GeneratorBasedBuilder): + """ + VIVOS is a Vietnamese speech corpus from https://zenodo.org/records/7068130. 
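Looping back to the VISTEC-TP-TH-2021 loader above before the VIVOS class body: a minimal sketch of reading its token-level named-entity-boundary labels through the seq_label config (the labels are the binary classes "0"/"1" declared in LABEL_CLASSES), under the same local-script assumptions as the earlier sketches.

from datasets import load_dataset

ds = load_dataset(
    "seacrowd/sea_datasets/vistec_tp_th_21/vistec_tp_th_21.py",
    name="vistec_tp_th_21_seacrowd_seq_label",
    split="train",
)
example = ds[0]
print(list(zip(example["tokens"], example["labels"]))[:10])  # (token, label) pairs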
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_sptext", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_sptext", + subset_id=f"{_DATASETNAME}", + ), + ] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "speaker_id": datasets.Value("string"), + "path": datasets.Value("string"), + "audio": datasets.Audio(sampling_rate=16_000), + "sentence": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_sptext": + features = schemas.speech_text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """ + Returns SplitGenerators. + """ + + audio_path = dl_manager.download(_URLS["audio"]) + train_prompt_path = dl_manager.download_and_extract(_URLS["train_prompt"]) + test_prompt_path = dl_manager.download_and_extract(_URLS["test_prompt"]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "prompts_path": train_prompt_path, + "clips_path": "vivos/train/waves", + "audio_files": dl_manager.iter_archive(audio_path), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "prompts_path": test_prompt_path, + "clips_path": "vivos/test/waves", + "audio_files": dl_manager.iter_archive(audio_path), + "split": "test", + }, + ), + ] + + def _generate_examples(self, prompts_path: Path, clips_path: Path, audio_files, split: str) -> Tuple[int, Dict]: + """ + Yields examples as (key, example) tuples. 
+ """ + examples = {} + with open(prompts_path, encoding="utf-8") as f: + if self.config.schema == "source": + for row in f: + data = row.strip().split(" ", 1) + speaker_id = data[0].split("_")[0] + audio_path = "/".join([clips_path, speaker_id, data[0] + ".wav"]) + examples[audio_path] = { + "speaker_id": speaker_id, + "path": audio_path, + "sentence": data[1], + } + elif self.config.schema == "seacrowd_sptext": + audio_id = 0 + for row in f: + data = row.strip().split(" ", 1) + speaker_id = data[0].split("_")[0] + audio_path = "/".join([clips_path, speaker_id, data[0] + ".wav"]) + examples[audio_path] = { + "id": audio_id, + "path": audio_path, + "text": data[1], + "speaker_id": speaker_id, + "metadata": { + "speaker_age": None, + "speaker_gender": None, + }, + } + audio_id += 1 + + idx = 0 + for path, f in audio_files: + if path.startswith(clips_path): + if path in examples: + audio = {"path": path, "bytes": f.read()} + yield idx, {**examples[path], "audio": audio} + idx += 1 + else: + continue \ No newline at end of file diff --git a/seacrowd/sea_datasets/vivqa/__init__.py b/seacrowd/sea_datasets/vivqa/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/vivqa/vivqa.py b/seacrowd/sea_datasets/vivqa/vivqa.py new file mode 100644 index 000000000..476ecd2a3 --- /dev/null +++ b/seacrowd/sea_datasets/vivqa/vivqa.py @@ -0,0 +1,218 @@ +# coding=utf-8 +import json +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@inproceedings{tran2021vivqa, + title={ViVQA: Vietnamese visual question answering}, + author={Tran, Khanh Quoc and Nguyen, An Trong and Le, An Tran-Hoai and Van Nguyen, Kiet}, + booktitle={Proceedings of the 35th Pacific Asia Conference on Language, Information and Computation}, + pages={683--691}, + year={2021} +} +""" +_DATASETNAME = "vivqa" +_DESCRIPTION = """\ +Vietnamese Visual Question Answering (ViVQA) consist of 10328 images and 15000 question-answer +pairs in Vietnamese for evaluating Vietnamese VQA models. This dataset is built based on 10328 randomly +selected images from MS COCO dataset. The question-answer pairs were based on the COCO-QA dataset that +was automatically translated from English to Vietnamese. 
+""" +_HOMEPAGE = "https://github.com/kh4nh12/ViVQA" +_LANGUAGES = ["vie"] +_LICENSE = Licenses.UNKNOWN.value +_LOCAL = False +_URLS = { + "viviq": {"train": "https://raw.githubusercontent.com/kh4nh12/ViVQA/main/train.csv", + "test": "https://raw.githubusercontent.com/kh4nh12/ViVQA/main/test.csv"}, + "cocodata": { + "coco2014_train_val_annots": "http://images.cocodataset.org/annotations/annotations_trainval2014.zip", + "coco2014_train_images": "http://images.cocodataset.org/zips/train2014.zip", + "coco2014_val_images": "http://images.cocodataset.org/zips/val2014.zip", + }, +} +_SUPPORTED_TASKS = [Tasks.VISUAL_QUESTION_ANSWERING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class VivQADataset(datasets.GeneratorBasedBuilder): + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_imqa", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_imqa", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "img_id": datasets.Value("string"), + "question": datasets.Value("string"), + "answer": datasets.Value("string"), + "type": datasets.Value("string"), + "coco_url": datasets.Value("string"), + "flickr_url": datasets.Value("string"), + "img_name": datasets.Value("string"), + "coco_license": datasets.Value("int32"), + "coco_width": datasets.Value("int32"), + "coco_height": datasets.Value("int32"), + "coco_date_captured": datasets.Value("string"), + "image_path": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_imqa": + features = schemas.imqa_features + features["meta"] = { + "coco_img_id": datasets.Value("string"), + "type": datasets.Value("string"), + "flickr_url": datasets.Value("string"), + "coco_url": datasets.Value("string"), + "img_name": datasets.Value("string"), + "coco_license": datasets.Value("int32"), + "coco_width": datasets.Value("int32"), + "coco_height": datasets.Value("int32"), + "coco_date_captured": datasets.Value("string"), + "image_path": datasets.Value("string"), + } + else: + raise ValueError(f"No schema matched for {self.config.schema}") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS["viviq"] + data_dir = dl_manager.download_and_extract(urls) + cocodata = dl_manager.download_and_extract(_URLS["cocodata"]) + Coco_Dict = self._get_image_detail(cocodata) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir["train"], + "split": "train", + "coco_dict": Coco_Dict, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_dir["test"], + "split": "test", + "coco_dict": Coco_Dict, + }, + ), + ] + + def _get_image_detail(self, coco_dir) -> Dict: + coco2014_train_val_annots = os.path.join(coco_dir["coco2014_train_val_annots"], "annotations") + train_ann_2014_path = os.path.join(coco2014_train_val_annots, 
"captions_train2014.json") + val_ann_2014_path = os.path.join(coco2014_train_val_annots, "captions_val2014.json") + coco_dict_val = {itm["id"]: itm for itm in json.load(open(val_ann_2014_path, "r"))["images"]} + coco_dict_train = {itm["id"]: itm for itm in json.load(open(train_ann_2014_path, "r"))["images"]} + coco_train_path = os.path.join(coco_dir["coco2014_train_images"], "train2014") + coco_val_path = os.path.join(coco_dir["coco2014_val_images"], "val2014") + coco_dict = {"train": coco_dict_train, "val": coco_dict_val, "coco_train_path": coco_train_path, "coco_val_path": coco_val_path} + + return coco_dict + + def _generate_examples(self, filepath: Path, split: str, coco_dict: Dict = None) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + raw_examples = pd.read_csv(filepath) + coco_train_ref = coco_dict["train"] + coco_val_ref = coco_dict["val"] + coco_ref = {**coco_train_ref, **coco_val_ref} + coco_train_path = coco_dict["coco_train_path"] + coco_val_path = coco_dict["coco_val_path"] + + for eid, exam in raw_examples.iterrows(): + assert len(exam) == 5 + exam_id, exam_quest, exam_answer, exam_img_id, exam_type = exam + coco_info = coco_ref[exam_img_id] + flickr_url = coco_info["flickr_url"] + img_name = coco_info["file_name"] + coco_url = coco_info["coco_url"] + coco_license = coco_info["license"] + coco_width = coco_info["width"] + coco_height = coco_info["height"] + coco_date_captured = coco_info["date_captured"] + coco_path = coco_train_path if exam_img_id in coco_train_ref else coco_val_path + image_path = os.path.join(coco_path, img_name) + + if self.config.schema == "source": + yield eid, { + "img_id": str(exam_img_id), + "question": exam_quest, + "answer": exam_answer, + "type": exam_type, + "coco_url": coco_url, + "flickr_url": flickr_url, + "img_name": img_name, + "coco_license": coco_license, + "coco_width": coco_width, + "coco_height": coco_height, + "coco_date_captured": coco_date_captured, + "image_path": image_path, + } + + elif self.config.schema == "seacrowd_imqa": + example = { + "id": str(eid), + "question_id": str(exam_id), + "document_id": str(eid), + "questions": [exam_quest], + "type": None, + "choices": None, + "context": None, + "answer": [exam_answer], + "image_paths": [image_path], + "meta": { + "coco_img_id": str(exam_img_id), + "type": exam_type, + "flickr_url": flickr_url, + "coco_url": coco_url, + "img_name": img_name, + "coco_license": coco_license, + "coco_width": coco_width, + "coco_height": coco_height, + "coco_date_captured": coco_date_captured, + "image_path": image_path, + }, + } + + yield eid, example diff --git a/seacrowd/sea_datasets/vlsp2016_ner/__init__.py b/seacrowd/sea_datasets/vlsp2016_ner/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/vlsp2016_ner/vlsp2016_ner.py b/seacrowd/sea_datasets/vlsp2016_ner/vlsp2016_ner.py new file mode 100644 index 000000000..af50849f0 --- /dev/null +++ b/seacrowd/sea_datasets/vlsp2016_ner/vlsp2016_ner.py @@ -0,0 +1,164 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This dataset is collected from electronic newspapers published on the web and provided by VLSP organization.\ +It consists of approximately 15k sentences, each of which contain NE information in the IOB annotation format\ +""" +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{nguyen-et-al-2019-vlsp-ner, +author = {Nguyen, Huyen and Ngo, Quyen and Vu, Luong and Mai, Vu and Nguyen, Hien}, +year = {2019}, +month = {01}, +pages = {283-294}, +title = {VLSP Shared Task: Named Entity Recognition}, +volume = {34}, +journal = {Journal of Computer Science and Cybernetics}, +doi = {10.15625/1813-9663/34/4/13161} +} +""" + +_DATASETNAME = "vlsp2016_ner" + +_DESCRIPTION = """\ +This dataset is collected from electronic newspapers published on the web and provided by VLSP organization. \ +It consists of approximately 15k sentences, each of which contain NE information in the IOB annotation format +""" + +_HOMEPAGE = "https://huggingface.co/datasets/datnth1709/VLSP2016-NER-data" + +_LANGUAGES = ["vie"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.CC_BY_NC_4_0.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: { + "train": "https://huggingface.co/datasets/datnth1709/VLSP2016-NER-data/resolve/main/data/train-00000-of-00001-b0417886a268b83a.parquet?download=true", + "test": "https://huggingface.co/datasets/datnth1709/VLSP2016-NER-data/resolve/main/data/valid-00000-of-00001-846411c236133ba3.parquet?download=true", + }, +} + +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class Visp2016NER(datasets.GeneratorBasedBuilder): + """This dataset is collected from electronic newspapers published on the web and provided by VLSP organization. 
+ It consists of approximately 15k sentences, each of which contain NE information in the IOB annotation format""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="vlsp2016_ner_source", + version=SOURCE_VERSION, + description="vlsp2016_ner source schema", + schema="source", + subset_id="vlsp2016_ner", + ), + SEACrowdConfig( + name="vlsp2016_ner_seacrowd_seq_label", + version=SEACROWD_VERSION, + description="vlsp2016_ner SEACrowd schema", + schema="seacrowd_seq_label", + subset_id="vlsp2016_ner", + ), + ] + + DEFAULT_CONFIG_NAME = "vlsp2016_ner_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "tokens": datasets.Sequence(datasets.Value("string")), + "ner_tags": datasets.Sequence(datasets.Value("int64")), + } + ) + elif self.config.schema == "seacrowd_seq_label": + features = schemas.seq_label.features([x for x in range(9)]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + train_url = _URLS[_DATASETNAME]["train"] + train_path = dl_manager.download_and_extract(train_url) + + test_url = _URLS[_DATASETNAME]["test"] + test_path = dl_manager.download_and_extract(test_url) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": train_path, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": test_path, + "split": "test", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + df = pd.read_parquet(filepath) + if self.config.schema == "source": + for i in range(len(df)): + row = df.iloc[i] + yield ( + i, + { + "tokens": row["tokens"], + "ner_tags": row["ner_tags"], + }, + ) + elif self.config.schema == "seacrowd_seq_label": + for i in range(len(df)): + row = df.iloc[i] + yield ( + i, + { + "id": i, + "tokens": row["tokens"], + "labels": row["ner_tags"], + }, + ) diff --git a/seacrowd/sea_datasets/vlsp2016_sa/__init__.py b/seacrowd/sea_datasets/vlsp2016_sa/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/vlsp2016_sa/vlsp2016_sa.py b/seacrowd/sea_datasets/vlsp2016_sa/vlsp2016_sa.py new file mode 100644 index 000000000..903d68f90 --- /dev/null +++ b/seacrowd/sea_datasets/vlsp2016_sa/vlsp2016_sa.py @@ -0,0 +1,181 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
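As a usage note for the VLSP2016-NER loader defined above, the following is a minimal sketch of loading its seq_label config directly from the script path with the Hugging Face datasets library. The relative script path and the trust_remote_code flag are assumptions about the local checkout; very recent datasets releases drop loading-script support entirely, in which case a pinned older version is needed.

    import datasets

    # Sketch only: load the SEACrowd seq_label config straight from the dataloader script.
    dset = datasets.load_dataset(
        "seacrowd/sea_datasets/vlsp2016_ner/vlsp2016_ner.py",  # path relative to the repo root (assumed)
        name="vlsp2016_ner_seacrowd_seq_label",
        split="train",
        trust_remote_code=True,  # required by newer datasets versions for script-based datasets
    )
    sample = dset[0]
    print(sample["id"], sample["tokens"][:8], sample["labels"][:8])

The labels come back as integers in the 9-class space declared by schemas.seq_label.features above, so any tag-name mapping has to be applied separately.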
+ +import os +import re +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +_CITATION = """\ +@article{nguyen2018vlsp, + title={VLSP shared task: sentiment analysis}, + author={Nguyen, Huyen TM and Nguyen, Hung V and Ngo, \ +Quyen T and Vu, Luong X and Tran, Vu Mai and Ngo, Bach X and Le, Cuong A}, + journal={Journal of Computer Science and Cybernetics}, + volume={34}, + number={4}, + pages={295--310}, + year={2018} +} +""" +_DATASETNAME = "vlsp2016_sa" + +_DESCRIPTION = """\ +The SA-VLSP2016 dataset were collected from three source sites which are tinhte.vn, \ +vnexpress.net and Facebook, and used for the sentiment analysis task. The data consists \ +of comments of technical articles on those sites. Each comment is given one of \ +four labels: POS (positive), NEG (negative), NEU (neutral) and USELESS (filter-out). +""" + +_HOMEPAGE = "https://vlsp.org.vn/resources-vlsp2016" +_LANGUAGES = ["vie"] + +_LICENSE = Licenses.CC_BY_NC_SA_4_0.value +_LOCAL = True + +_URLS = {} + +_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" +_TAGS = ["POS", "NEG", "NEU"] + + +class VLSP2016SADataset(datasets.GeneratorBasedBuilder): + """The SA-VLSP2016 dataset, used for sentiment analysis, comprises comments from technical \ + articles on tinhte.vn, vnexpress.net, and Facebook, each labeled as positive, negative, neutral, or filter-out.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + SEACROWD_SCHEMA_NAME = "text" + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_tokenized_seacrowd_{SEACROWD_SCHEMA_NAME}", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", + subset_id=f"{_DATASETNAME}_tokenized", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + "label": datasets.ClassLabel(names=_TAGS), + } + ) + + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}": + features = schemas.text_features(_TAGS) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + if self.config.data_dir is None: + raise ValueError("This is a local dataset. 
Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join(data_dir, "SA2016-training_data"), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir, "SA2016-TestData-Ans"), + "split": "dev", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + if split == "dev": + if self.config.schema in ["source", f"seacrowd_{self.SEACROWD_SCHEMA_NAME}"]: + labelfile = "test_raw_ANS.txt" + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}_tokenized": + labelfile = "test_tokenized_ANS.txt" + + with open(os.path.join(filepath, labelfile)) as file: + data = file.read() + + pattern = re.compile("(?P.+)\n(?P
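Because the answer-file pattern above is cut off at this point, here is a small hedged sketch of the kind of named-group parsing it sets up: pairing each sentiment label with the comment text that follows it. The group names, the POS/NEG/NEU alternation, and the sample lines are assumptions for illustration rather than the exact layout of the VLSP 2016 release.

    import re

    # Sketch only: pair each sentiment label with the comment on the following line.
    # Group names and file layout are assumed, not taken from the VLSP 2016 data.
    pattern = re.compile(r"(?P<label>POS|NEG|NEU)\n(?P<text>.+)")

    sample = "POS\nMáy chạy rất mượt.\nNEG\nPin tụt quá nhanh."
    for m in pattern.finditer(sample):
        print(m.group("label"), "->", m.group("text"))

Each match then maps directly onto the id/text/label fields of the source schema and the seacrowd_text schema declared in _info above.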