diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index 2c97055a5..8b4f91549 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -45,6 +45,9 @@ import {albanianTransforms} from './sq/albanian-transforms.js'; import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js'; import {tagalogTransforms} from './tl/tagalog-transforms.js'; import {normalizeDiacritics} from './vi/viet-text-preprocessors.js'; +import {convertFinalLetters, convertYiddishLigatures} from './yi/yiddish-text-postprocessors.js'; +import {combineYiddishLigatures, removeYiddishDiacritics} from './yi/yiddish-text-preprocessors.js'; +import {yiddishTransforms} from './yi/yiddish-transforms.js'; import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js'; const capitalizationPreprocessors = { @@ -381,6 +384,21 @@ const languageDescriptors = [ normalizeDiacritics, }, }, + { + iso: 'yi', + iso639_3: 'yid', + name: 'Yiddish', + exampleText: 'באַשאַפֿן', + textPreprocessors: { + removeYiddishDiacritics, + combineYiddishLigatures, + }, + textPostprocessors: { + convertFinalLetters, + convertYiddishLigatures, + }, + languageTransforms: yiddishTransforms, + }, { iso: 'yue', iso639_3: 'yue', diff --git a/ext/js/language/yi/yiddish-text-postprocessors.js b/ext/js/language/yi/yiddish-text-postprocessors.js new file mode 100644 index 000000000..d082be7eb --- /dev/null +++ b/ext/js/language/yi/yiddish-text-postprocessors.js @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
/**
 * Maps each Hebrew-script letter that has a distinct word-final shape to that
 * final shape.
 */
const finalLetterMap = new Map([
    ['\u05de', '\u05dd'], // מ to ם
    ['\u05e0', '\u05df'], // נ to ן
    ['\u05e6', '\u05e5'], // צ to ץ
    ['\u05e4', '\u05e3'], // פ to ף
    ['\u05db', '\u05da'], // כ to ך
]);

/**
 * Precomposed Yiddish characters (`lig`) paired with the equivalent sequence
 * of separate code points (`split`). Entries are applied in this order.
 */
const ligatures = [
    {lig: '\u05f0', split: '\u05d5\u05d5'}, // װ -> וו
    {lig: '\u05f1', split: '\u05d5\u05d9'}, // ױ -> וי
    {lig: '\u05f2', split: '\u05d9\u05d9'}, // ײ -> יי
    {lig: '\ufb1d', split: '\u05d9\u05b4'}, // יִ -> יִ
    {lig: '\ufb1f', split: '\u05d9\u05d9\u05b7'}, // ײַ -> ייַ
    {lig: '\ufb2e', split: '\u05d0\u05b7'}, // Pasekh alef
    {lig: '\ufb2f', split: '\u05d0\u05b8'}, // Komets alef
];

/**
 * Replaces the last character of the text with its word-final form when it
 * has one; any other text (including the empty string) is returned unchanged.
 * @type {import('language').TextProcessor}
 */
export const convertFinalLetters = {
    name: 'Convert to Final Letters',
    description: 'קויף → קויפֿ',
    options: [true],
    process: (str) => {
        const lastIndex = str.length - 1;
        // For an empty string charAt(-1) yields '', which is not a key of the map.
        const finalForm = finalLetterMap.get(str.charAt(lastIndex));
        return finalForm === undefined ? str : str.substring(0, lastIndex) + finalForm;
    },
};

/**
 * Converts between precomposed ligatures and their split spelling.
 * - 'off': text is returned untouched.
 * - 'direct': every ligature is replaced by its split sequence.
 * - 'inverse': every split sequence is replaced by its ligature.
 * Any other setting leaves the text untouched.
 * @type {import('language').BidirectionalConversionPreprocessor}
 */
export const convertYiddishLigatures = {
    name: 'Split Ligatures',
    description: 'וו → װ',
    options: ['off', 'direct', 'inverse'],
    process: (str, setting) => {
        switch (setting) {
            case 'direct':
                // replaceAll: a word may contain the same ligature more than once.
                for (const {lig, split} of ligatures) {
                    str = str.replaceAll(lig, split);
                }
                return str;
            case 'inverse':
                for (const {lig, split} of ligatures) {
                    str = str.replaceAll(split, lig);
                }
                return str;
            case 'off':
            default:
                return str;
        }
    },
};
/*
 * Copyright (C) 2024  Yomitan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see https://www.gnu.org/licenses/.
 */

/**
 * Precomposed Yiddish characters (`lig`) paired with the equivalent sequence
 * of separate code points (`split`). Entries are applied in this order.
 */
const ligatures = [
    {lig: '\u05f0', split: '\u05d5\u05d5'}, // װ -> וו
    {lig: '\u05f1', split: '\u05d5\u05d9'}, // ױ -> וי
    {lig: '\u05f2', split: '\u05d9\u05d9'}, // ײ -> יי
    {lig: '\ufb1d', split: '\u05d9\u05b4'}, // יִ -> יִ
    {lig: '\ufb1f', split: '\u05d9\u05d9\u05b7'}, // ײַ -> ייַ
    {lig: '\ufb2e', split: '\u05d0\u05b7'}, // Pasekh alef
    {lig: '\ufb2f', split: '\u05d0\u05b8'}, // Komets alef
];

/**
 * Replaces every split letter sequence in the text with its precomposed
 * ligature.
 * @type {import('language').TextProcessor}
 */
export const combineYiddishLigatures = {
    name: 'Combine Ligatures',
    description: 'וו → װ',
    options: [true],
    process: (str) => {
        // replaceAll: a string pattern passed to replace() only rewrites the
        // first occurrence, which left later sequences in the word uncombined.
        for (const {lig, split} of ligatures) {
            str = str.replaceAll(split, lig);
        }
        return str;
    },
};

/**
 * Strips every code point in the range U+05B0 to U+05C7 from the text.
 * @type {import('language').TextProcessor}
 */
export const removeYiddishDiacritics = {
    name: 'Remove Diacritics',
    description: 'פאת → פֿאָתּ',
    options: [true],
    process: (str) => {
        return str.replace(/[\u05B0-\u05C7]/g, '');
    },
};
/*
 * Copyright (C) 2024  Yomitan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see https://www.gnu.org/licenses/.
 */

import {suffixInflection} from '../language-transforms.js';

/** @typedef {keyof typeof conditions} Condition */

/**
 * Stem vowel alternations. `new` is the vowel expected in the inflected form,
 * `orig` is the vowel written back into the deinflected candidate.
 */
const mutations = [
    {new: '\u05e2', orig: '\ufb2e'}, // Ayin to pasekh alef
    {new: '\u05e2', orig: '\ufb2f'}, // Ayin to komets alef
    {new: '\u05e2', orig: '\u05D0'}, // Ayin to shtumer alef
    {new: '\u05f1', orig: '\u05e2'}, // Vov yud to ayin
    {new: '\u05f2', orig: '\u05f1'}, // Tsvey yudn to Vov yud
    {new: '\u05d9', orig: '\u05d5'}, // Yud to Vov
];

/**
 * Matches the last character of a stem that belongs to the vowel set below
 * (the negative lookahead rejects any match that is followed by another one).
 */
const lastStemVowelPattern = /[\u05E2\u05F0\u05D0\uFB2E\u05F1\u05D5\u05F2\uFB1D\uFB1F\u05D9\uFB2F](?!.*[\u05E2\u05F0\u05D0\uFB2E\u05F1\u05D5\u05F2\uFB1D\uFB1F\u05D9\uFB2F])/;

/**
 * Builds one suffix rule per entry of `mutations`. Each rule removes the
 * inflected suffix and, when the last vowel of the remaining stem equals the
 * mutation's `new` vowel, swaps it for the `orig` vowel before appending the
 * deinflected suffix. When the vowel does not match, the rule yields ''.
 * @param {string} inflectedSuffix
 * @param {string} deinflectedSuffix
 * @param {Condition[]} conditionsIn
 * @param {Condition[]} conditionsOut
 * @returns {import('language-transformer').SuffixRule[]}
 */
function umlautMutationSuffixInflection(inflectedSuffix, deinflectedSuffix, conditionsIn, conditionsOut) {
    const isInflected = new RegExp(inflectedSuffix + '$');
    const suffixLength = inflectedSuffix.length;
    return mutations.map(({new: inflectedVowel, orig: baseVowel}) => {
        const deinflect = (/** @type {string} */ text) => {
            const stem = text.slice(0, -suffixLength);
            const hit = lastStemVowelPattern.exec(stem);
            if (hit?.[0] !== inflectedVowel) {
                return '';
            }
            return stem.slice(0, hit.index) + baseVowel + stem.slice(hit.index + 1) + deinflectedSuffix;
        };
        return {
            type: 'suffix',
            isInflected,
            deinflected: deinflectedSuffix,
            deinflect,
            conditionsIn,
            conditionsOut,
        };
    });
}

const conditions = {
    v: {
        name: 'Verb',
        isDictionaryForm: true,
        subConditions: ['vpast', 'vpresent'],
    },
    vpast: {
        name: 'Verb, past tense',
        isDictionaryForm: false,
    },
    vpresent: {
        name: 'Verb, present tense',
        isDictionaryForm: true,
    },
    n: {
        name: 'Noun',
        isDictionaryForm: true,
        subConditions: ['np', 'ns'],
    },
    np: {
        name: 'Noun, plural',
        isDictionaryForm: false,
    },
    ns: {
        name: 'Noun, singular',
        isDictionaryForm: true,
    },
    adj: {
        name: 'Adjective',
        isDictionaryForm: true,
    },
    adv: {
        name: 'Adverb',
        isDictionaryForm: true,
    },
};

// Suffix tables for the noun rules; rule order follows table order.
const pluralSuffixes = [
    '\u05E1', // -s
    '\u05DF', // -n
    '\u05D9\u05DD', // -im, hebrew
    '\u05E2\u05E8', // -er
    '\u05E2\u05DA', // -ekh
    '\u05E2\u05DF', // -en
    '\u05E2\u05E1', // -es
    '\u05D5\u05EA', // -ot, hebrew
    '\u05E0\u05E1', // -ns
    '\u05E2\u05E8\u05E2\u05DF', // -eren
    '\u05E2\u05E0\u05E2\u05E1', // -enes
    '\u05E2\u05E0\u05E1', // -ens
    '\u05E2\u05E8\u05E1', // -ers
    '\u05E1\u05E2\u05E8', // -ser
];

const umlautPluralSuffixes = [
    '\u05E2\u05E8', // -er
    '\u05E2\u05E1', // -es
    '\u05D9\u05DD', // -im
    '\u05E2\u05DF', // -en
    '\u05DF', // -n
    '\u05E1', // -s
    '\u05E2\u05DA', // -ekh
    '\u05E2\u05E8\u05E1', // -ers
];

const diminutiveSuffixes = [
    '\u05D8\u05E9\u05D9\u05E7', // -tshik
    '\u05E7\u05E2', // -ke
    '\u05DC', // -l
    '\u05E2\u05DC\u05E2', // -ele
];

const umlautDiminutiveSuffixes = [
    '\u05DC', // -l
    '\u05E2\u05DC\u05E2', // -ele
];

/** @type {import('language-transformer').LanguageTransformDescriptor} */
export const yiddishTransforms = {
    language: 'yi',
    conditions,
    transforms: {
        plural: {
            name: 'plural',
            description: 'plural form of a noun',
            rules: pluralSuffixes.map((suffix) => suffixInflection(suffix, '', ['np'], ['ns'])),
        },
        umlaut_plural: {
            name: 'umlaut_plural',
            description: 'plural form of a umlaut noun',
            rules: umlautPluralSuffixes.flatMap((suffix) => umlautMutationSuffixInflection(suffix, '', ['np'], ['ns'])),
        },
        diminutive: {
            name: 'diminutive',
            description: 'diminutive form of a noun',
            rules: diminutiveSuffixes.map((suffix) => suffixInflection(suffix, '', ['n'], ['n'])),
        },
        diminutive_and_umlaut: {
            name: 'diminutive_and_umlaut',
            description: 'diminutive form of a noun with stem umlaut',
            rules: umlautDiminutiveSuffixes.flatMap((suffix) => umlautMutationSuffixInflection(suffix, '', ['n'], ['n'])),
        },
        verb_present_singular_to_first_person: {
            name: 'verb_present_singular_to_first_person',
            description: 'Turn the second and third person singular form to first person',
            rules: [
                suffixInflection('\u05E1\u05D8', '', ['v'], ['vpresent']), // -st
                suffixInflection('\u05D8', '', ['v'], ['vpresent']), // -t
                suffixInflection('\u05E0\u05D3\u05D9\u05E7', '', ['v'], ['vpresent']), // -ndik
            ],
        },
        verb_present_plural_to_first_person: {
            name: 'verb_present_plural_to_first_person',
            description: 'Turn the second plural form to first person plural form',
            rules: [
                suffixInflection('\u05D8\u05E1', '\u05E0', ['v'], ['vpresent']), // -ts
                suffixInflection('\u05D8', '\u05E0', ['v'], ['vpresent']), // -t
            ],
        },
    },
};
/*
 * Test-case terms never use the final form of a letter, because final letters
 * are applied by a text postprocessor after the transforms have run.
 * Sources still need final letters so that the plural rules can match.
 */
const tests = [
    {
        category: 'nouns',
        valid: true,
        tests: [
            {term: 'גרופּע', source: 'גרופּעס', rule: 'ns', reasons: ['plural']}, // grupes -> grupe
            {term: 'טיש', source: 'טישן', rule: 'ns', reasons: ['plural']}, // tishn -> tish
            {term: 'פּויער', source: 'פּויערים', rule: 'ns', reasons: ['plural']}, // poyerim -> poyer
            {term: 'קינד', source: 'קינדער', rule: 'ns', reasons: ['plural']}, // kinder -> kind
            {term: 'בענקל', source: 'בענקלעך', rule: 'ns', reasons: ['plural']}, // benklekh -> benkl
            {term: 'באַנ', source: 'באַנען', rule: 'ns', reasons: ['plural']}, // banen -> ban
            {term: 'נודניק', source: 'נודניקעס', rule: 'ns', reasons: ['plural']}, // nudnikes -> nudnik
            {term: 'חלומ', source: 'חלומות', rule: 'ns', reasons: ['plural']}, // khlomos -> khlom
            {term: 'עטיקעט', source: 'עטיקעטקע', rule: 'n', reasons: ['diminutive']}, // etiketke -> etiket
            {term: 'קליענטעל', source: 'קליענטעלטשיק', rule: 'n', reasons: ['diminutive']}, // klienteltshik -> klientel
            {term: 'קינדער', source: 'קינדערלעך', rule: 'ns', reasons: ['diminutive', 'plural']}, // kinderlekh -> kinder
            {term: 'ליפ', source: 'ליפענעס', rule: 'ns', reasons: ['plural']}, // lipenes -> lip
            {term: 'אײ', source: 'אײערען', rule: 'ns', reasons: ['plural']}, // eyeren -> ey
            {term: 'זאָק', source: 'זאָקענס', rule: 'ns', reasons: ['plural']}, // zokens -> zok
            {term: 'בוך', source: 'בוךערס', rule: 'ns', reasons: ['plural']}, // bukhers -> bukh
            {term: 'קוכן', source: 'קוכןסער', rule: 'ns', reasons: ['plural']}, // kukhnser -> kukhn
            {term: 'קעניג', source: 'קעניגנס', rule: 'ns', reasons: ['plural']}, // kenigns -> kenig
        ],
    },
    {
        category: 'umlaut_nouns',
        valid: true,
        tests: [
            {term: 'מאנ', source: 'מענער', rule: 'ns', reasons: ['umlaut_plural']}, // mener -> man
            {term: 'טשוואק', source: 'טשוועקעס', rule: 'ns', reasons: ['umlaut_plural']}, // tshvekes -> tshvak
            {term: 'מױד', source: 'מײדלעך', rule: 'ns', reasons: ['diminutive_and_umlaut', 'plural']}, // meydlekh -> moyd
            {term: 'דאָקטער', source: 'דאָקטױרים', rule: 'ns', reasons: ['umlaut_plural']}, // doktoyrim -> dokter
            {term: 'בלומ', source: 'בלימען', rule: 'ns', reasons: ['umlaut_plural']}, // blimen -> blum
            {term: 'אומשטאנד', source: 'אומשטענדן', rule: 'ns', reasons: ['umlaut_plural']}, // umshtendn -> umshtand
            {term: 'קאצ', source: 'קעצעלע', rule: 'n', reasons: ['diminutive_and_umlaut']}, // ketzele -> katz
            {term: 'קאצ', source: 'קעצל', rule: 'n', reasons: ['diminutive_and_umlaut']}, // ketzl -> katz
            {term: 'באַרג', source: 'בערגן', rule: 'ns', reasons: ['umlaut_plural']}, // bergn -> barg
            {term: 'בױמ', source: 'בײמערס', rule: 'ns', reasons: ['umlaut_plural']}, // beymers -> boim
            {term: 'קאפּ', source: 'קעפּער', rule: 'ns', reasons: ['umlaut_plural']}, // keper -> kop
        ],
    },
    {
        category: 'verbs',
        valid: true,
        tests: [
            {term: 'קויפֿ', source: 'קויפֿסט', rule: 'v', reasons: ['verb_present_singular_to_first_person']},
            {term: 'קויפֿ', source: 'קויפֿט', rule: 'vpresent', reasons: ['verb_present_singular_to_first_person']},
            {term: 'קויפֿנ', source: 'קויפֿט', rule: 'vpresent', reasons: ['verb_present_plural_to_first_person']},
            {term: 'קויפֿנ', source: 'קויפֿטס', rule: 'vpresent', reasons: ['verb_present_plural_to_first_person']},
        ],
    },
];

// Register the Yiddish descriptor and run every case above through the shared fixture.
const languageTransformer = new LanguageTransformer();
languageTransformer.addDescriptor(yiddishTransforms);
testLanguageTransformer(languageTransformer, tests);
TextProcessor<'old' | 'new' | 'off'>; }; }; + yi: { + pre: { + combineYiddishLigatures: TextProcessor; + removeYiddishDiacritics: TextProcessor; + }; + post: { + convertFinalLetters: TextProcessor; + convertYiddishLigatures: BidirectionalConversionPreprocessor; + }; + }; yue: { pre: { normalizeRadicalCharacters: TextProcessor;