diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index 2c97055a5..8b4f91549 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -45,6 +45,9 @@ import {albanianTransforms} from './sq/albanian-transforms.js';
import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
import {tagalogTransforms} from './tl/tagalog-transforms.js';
import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
+import {convertFinalLetters, convertYiddishLigatures} from './yi/yiddish-text-postprocessors.js';
+import {combineYiddishLigatures, removeYiddishDiacritics} from './yi/yiddish-text-preprocessors.js';
+import {yiddishTransforms} from './yi/yiddish-transforms.js';
import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js';
const capitalizationPreprocessors = {
@@ -381,6 +384,21 @@ const languageDescriptors = [
normalizeDiacritics,
},
},
+ {
+ iso: 'yi',
+ iso639_3: 'yid',
+ name: 'Yiddish',
+ exampleText: 'באַשאַפֿן',
+ textPreprocessors: { // NOTE(review): presumably applied to the scanned text before deinflection — confirm
+ removeYiddishDiacritics,
+ combineYiddishLigatures,
+ },
+ textPostprocessors: { // run after all transforms, per the note in test/language/yiddish-transforms.test.js
+ convertFinalLetters,
+ convertYiddishLigatures,
+ },
+ languageTransforms: yiddishTransforms,
+ },
{
iso: 'yue',
iso639_3: 'yue',
diff --git a/ext/js/language/yi/yiddish-text-postprocessors.js b/ext/js/language/yi/yiddish-text-postprocessors.js
new file mode 100644
index 000000000..d082be7eb
--- /dev/null
+++ b/ext/js/language/yi/yiddish-text-postprocessors.js
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+
+ const final_letter_map = new Map([ // key: regular letter, value: its word-final form
+ ['\u05de', '\u05dd'], // mem מ to final mem ם
+ ['\u05e0', '\u05df'], // nun נ to final nun ן
+ ['\u05e6', '\u05e5'], // tsadek צ to final tsadek ץ
+ ['\u05e4', '\u05e3'], // pey פ to final fey ף
+ ['\u05dB', '\u05da'], // khof כ to final khof ך
+ ]);
+
+ const ligatures = [ // lig: single precomposed code point; split: the equivalent letter (+ vowel point) sequence
+ {lig: '\u05f0', split: '\u05d5' + '\u05d5'}, // װ (U+05F0) -> וו, double vov
+ {lig: '\u05f1', split: '\u05d5' + '\u05d9'}, // ױ (U+05F1) -> וי, vov yud
+ {lig: '\u05f2', split: '\u05d9' + '\u05d9'}, // ײ (U+05F2) -> יי, double yud
+ {lig: '\ufb1d', split: '\u05d9' + '\u05b4'}, // U+FB1D -> yud + khirik (U+05B4)
+ {lig: '\ufb1f', split: '\u05d9' + '\u05d9' + '\u05b7'}, // U+FB1F -> double yud + pasekh (U+05B7)
+ {lig: '\ufb2e', split: '\u05d0' + '\u05b7'}, // Pasekh alef: U+FB2E -> alef + pasekh (U+05B7)
+ {lig: '\ufb2f', split: '\u05d0' + '\u05b8'}, // Komets alef: U+FB2F -> alef + komets (U+05B8)
+ ];
+
+ /**
+  * Swaps a trailing letter found in `final_letter_map` for its word-final form; only the last UTF-16 unit is inspected, and any other text (including '') is returned unchanged.
+  * @type {import('language').TextProcessor}
+  */
+ export const convertFinalLetters = {
+     name: 'Convert to Final Letters',
+     description: 'קויף → קויפֿ',
+     options: [true],
+     process: (str) => {
+         const finalForm = final_letter_map.get(str.charAt(str.length - 1)); // one Map lookup instead of copying the keys into an array
+         return (typeof finalForm === 'undefined') ? str : str.slice(0, -1) + finalForm;
+     },
+ };
+
+ /**
+  * Converts between ligature code points and their spelled-out sequences, walking `ligatures` in table order.
+  * 'direct' expands every ligature, 'inverse' contracts every sequence, and any other setting returns the text as-is.
+  * @type {import('language').BidirectionalConversionPreprocessor}
+  */
+ export const convertYiddishLigatures = {
+     name: 'Split Ligatures',
+     description: 'וו → װ',
+     options: ['off', 'direct', 'inverse'],
+     process: (str, setting) => {
+         // split/join rewrites all occurrences of a pattern in the text;
+         // String.prototype.replace with a string pattern would stop after the first one.
+         switch (setting) {
+             case 'direct':
+                 return ligatures.reduce((text, {lig, split}) => text.split(lig).join(split), str);
+             case 'inverse':
+                 return ligatures.reduce((text, {lig, split}) => text.split(split).join(lig), str);
+             default: // 'off', or an unrecognised setting: never fall through to an undefined result
+                 return str;
+         }
+     },
+ };
diff --git a/ext/js/language/yi/yiddish-text-preprocessors.js b/ext/js/language/yi/yiddish-text-preprocessors.js
new file mode 100644
index 000000000..8c9684d63
--- /dev/null
+++ b/ext/js/language/yi/yiddish-text-preprocessors.js
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+ const ligatures = [ // lig: single precomposed code point; split: the equivalent letter (+ vowel point) sequence
+ {lig: '\u05f0', split: '\u05d5' + '\u05d5'}, // װ (U+05F0) -> וו, double vov
+ {lig: '\u05f1', split: '\u05d5' + '\u05d9'}, // ױ (U+05F1) -> וי, vov yud
+ {lig: '\u05f2', split: '\u05d9' + '\u05d9'}, // ײ (U+05F2) -> יי, double yud
+ {lig: '\ufb1d', split: '\u05d9' + '\u05b4'}, // U+FB1D -> yud + khirik (U+05B4)
+ {lig: '\ufb1f', split: '\u05d9' + '\u05d9' + '\u05b7'}, // U+FB1F -> double yud + pasekh (U+05B7)
+ {lig: '\ufb2e', split: '\u05d0' + '\u05b7'}, // Pasekh alef: U+FB2E -> alef + pasekh (U+05B7)
+ {lig: '\ufb2f', split: '\u05d0' + '\u05b8'}, // Komets alef: U+FB2F -> alef + komets (U+05B8)
+ ];
+
+ /**
+  * Contracts spelled-out sequences into their ligature code points, walking `ligatures` in table order.
+  * Every occurrence in the text is rewritten via split/join;
+  * String.prototype.replace with a string pattern would only touch the first one.
+  * @type {import('language').TextProcessor}
+  */
+ export const combineYiddishLigatures = {
+     name: 'Combine Ligatures',
+     description: 'וו → װ',
+     options: [true],
+     process: (str) => ligatures.reduce((text, {lig, split}) => text.split(split).join(lig), str),
+ };
+
+ // Deletes every character in the range U+05B0–U+05C7 from the text (the points U+05B4, U+05B7 and
+ // U+05B8 used by the ligature table fall inside it); precomposed letters such as U+FB2E are outside it and survive.
+ /** @type {import('language').TextProcessor} */
+ export const removeYiddishDiacritics = {
+     name: 'Remove Diacritics',
+     description: 'פאת → פֿאָתּ',
+     options: [true],
+     process: (str) => str.replace(/[\u05B0-\u05C7]/g, ''),
+ };
diff --git a/ext/js/language/yi/yiddish-transforms.js b/ext/js/language/yi/yiddish-transforms.js
new file mode 100644
index 000000000..70f2b7136
--- /dev/null
+++ b/ext/js/language/yi/yiddish-transforms.js
@@ -0,0 +1,167 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {suffixInflection} from '../language-transforms.js';
+
+/** @typedef {keyof typeof conditions} Condition */
+
+ const mutations = [ // new: letter as found in the inflected stem; orig: letter put back when deinflecting
+ {new: '\u05e2', orig: '\ufb2e'}, // Ayin to pasekh alef
+ {new: '\u05e2', orig: '\ufb2f'}, // Ayin to komets alef
+ {new: '\u05e2', orig: '\u05D0'}, // Ayin to shtumer (plain) alef
+ {new: '\u05f1', orig: '\u05e2'}, // Vov yud to ayin
+ {new: '\u05f2', orig: '\u05f1'}, // Tsvey yudn to vov yud
+ {new: '\u05d9', orig: '\u05d5'}, // Yud to vov
+ ];
+
+/**
+ * @param {string} inflectedSuffix
+ * @param {string} deinflectedSuffix
+ * @param {Condition[]} conditionsIn
+ * @param {Condition[]} conditionsOut
+ * @returns {import('language-transformer').SuffixRule[]}
+ */
+function umlautMutationSuffixInflection(inflectedSuffix, deinflectedSuffix, conditionsIn, conditionsOut) {
+ const suffixRegExp = new RegExp(inflectedSuffix + '$');
+ return mutations.map((mutation) => (
+ {
+ type: 'suffix',
+ isInflected: suffixRegExp,
+ deinflected: deinflectedSuffix,
+ deinflect: (/** @type {string} */ text) => {
+ const match = new RegExp(/[\u05E2\u05F0\u05D0\uFB2E\u05F1\u05D5\u05F2\uFB1D\uFB1F\u05D9\uFB2F](?!.*[\u05E2\u05F0\u05D0\uFB2E\u05F1\u05D5\u05F2\uFB1D\uFB1F\u05D9\uFB2F])/).exec(text.slice(0, -inflectedSuffix.length));
+ return (match?.[0] !== mutation.new) ? '' : text.slice(0, match.index) + mutation.orig + text.slice(match.index + 1, -inflectedSuffix.length) + deinflectedSuffix;
+ },
+ conditionsIn,
+ conditionsOut,
+ }
+ ));
+}
+
+ const conditions = { // tags used in the rules' conditionsIn / conditionsOut lists and in each other's subConditions
+ v: {
+ name: 'Verb',
+ isDictionaryForm: true,
+ subConditions: ['vpast', 'vpresent'],
+ },
+ vpast: { // declared as a sub-condition of v; no rule in this file references it directly
+ name: 'Verb, past tense',
+ isDictionaryForm: false,
+ },
+ vpresent: {
+ name: 'Verb, present tense',
+ isDictionaryForm: true,
+ },
+ n: {
+ name: 'Noun',
+ isDictionaryForm: true,
+ subConditions: ['np', 'ns'],
+ },
+ np: {
+ name: 'Noun, plural',
+ isDictionaryForm: false,
+ },
+ ns: {
+ name: 'Noun, singular',
+ isDictionaryForm: true,
+ },
+ adj: { // no rule in this file references this tag
+ name: 'Adjective',
+ isDictionaryForm: true,
+ },
+ adv: { // no rule in this file references this tag
+ name: 'Adverb',
+ isDictionaryForm: true,
+ },
+ };
+
+ /** @type {import('language-transformer').LanguageTransformDescriptor} */
+ export const yiddishTransforms = { // NOTE(review): suffixInflection presumably takes (inflectedSuffix, deinflectedSuffix, conditionsIn, conditionsOut), like umlautMutationSuffixInflection — confirm in ../language-transforms.js
+ language: 'yi',
+ conditions,
+ transforms: {
+ plural: {
+ name: 'plural',
+ description: 'plural form of a noun',
+ rules: [
+ suffixInflection('\u05E1', '', ['np'], ['ns']), // -s
+ suffixInflection('\u05DF', '', ['np'], ['ns']), // -n (final nun)
+ suffixInflection('\u05D9\u05DD', '', ['np'], ['ns']), // -im, Hebrew
+ suffixInflection('\u05E2\u05E8', '', ['np'], ['ns']), // -er
+ suffixInflection('\u05E2\u05DA', '', ['np'], ['ns']), // -ekh
+ suffixInflection('\u05E2\u05DF', '', ['np'], ['ns']), // -en
+ suffixInflection('\u05E2\u05E1', '', ['np'], ['ns']), // -es
+ suffixInflection('\u05D5\u05EA', '', ['np'], ['ns']), // -ot, Hebrew
+ suffixInflection('\u05E0\u05E1', '', ['np'], ['ns']), // -ns
+ suffixInflection('\u05E2\u05E8\u05E2\u05DF', '', ['np'], ['ns']), // -eren
+ suffixInflection('\u05E2\u05E0\u05E2\u05E1', '', ['np'], ['ns']), // -enes
+ suffixInflection('\u05E2\u05E0\u05E1', '', ['np'], ['ns']), // -ens
+ suffixInflection('\u05E2\u05E8\u05E1', '', ['np'], ['ns']), // -ers
+ suffixInflection('\u05E1\u05E2\u05E8', '', ['np'], ['ns']), // -ser
+ ],
+ },
+ umlaut_plural: {
+ name: 'umlaut_plural',
+ description: 'plural form of a umlaut noun',
+ rules: [
+ ...umlautMutationSuffixInflection('\u05E2\u05E8', '', ['np'], ['ns']), // -er
+ ...umlautMutationSuffixInflection('\u05E2\u05E1', '', ['np'], ['ns']), // -es
+ ...umlautMutationSuffixInflection('\u05D9\u05DD', '', ['np'], ['ns']), // -im
+ ...umlautMutationSuffixInflection('\u05E2\u05DF', '', ['np'], ['ns']), // -en
+ ...umlautMutationSuffixInflection('\u05DF', '', ['np'], ['ns']), // -n (final nun)
+ ...umlautMutationSuffixInflection('\u05E1', '', ['np'], ['ns']), // -s
+ ...umlautMutationSuffixInflection('\u05E2\u05DA', '', ['np'], ['ns']), // -ekh
+ ...umlautMutationSuffixInflection('\u05E2\u05E8\u05E1', '', ['np'], ['ns']), // -ers
+ ],
+ },
+ diminutive: {
+ name: 'diminutive',
+ description: 'diminutive form of a noun',
+ rules: [
+ suffixInflection('\u05D8\u05E9\u05D9\u05E7', '', ['n'], ['n']), // -tshik
+ suffixInflection('\u05E7\u05E2', '', ['n'], ['n']), // -ke
+ suffixInflection('\u05DC', '', ['n'], ['n']), // -l
+ suffixInflection('\u05E2\u05DC\u05E2', '', ['n'], ['n']), // -ele
+ ],
+ },
+ diminutive_and_umlaut: {
+ name: 'diminutive_and_umlaut',
+ description: 'diminutive form of a noun with stem umlaut',
+ rules: [
+ ...umlautMutationSuffixInflection('\u05DC', '', ['n'], ['n']), // -l
+ ...umlautMutationSuffixInflection('\u05E2\u05DC\u05E2', '', ['n'], ['n']), // -ele
+ ],
+ },
+ verb_present_singular_to_first_person: {
+ name: 'verb_present_singular_to_first_person',
+ description: 'Turn the second and third person singular form to first person',
+ rules: [
+ suffixInflection('\u05E1\u05D8', '', ['v'], ['vpresent']), // -st
+ suffixInflection('\u05D8', '', ['v'], ['vpresent']), // -t
+ suffixInflection('\u05E0\u05D3\u05D9\u05E7', '', ['v'], ['vpresent']), // -ndik; NOTE(review): looks like a participle ending rather than a 2nd/3rd person form — confirm it belongs in this transform
+ ],
+ },
+ verb_present_plural_to_first_person: {
+ name: 'verb_present_plural_to_first_person',
+ description: 'Turn the second plural form to first person plural form',
+ rules: [
+ suffixInflection('\u05D8\u05E1', '\u05E0', ['v'], ['vpresent']), // -ts -> -n (regular nun, not the final form)
+ suffixInflection('\u05D8', '\u05E0', ['v'], ['vpresent']), // -t -> -n (regular nun, not the final form)
+ ],
+ },
+ },
+ };
diff --git a/test/language/yiddish-transforms.test.js b/test/language/yiddish-transforms.test.js
new file mode 100644
index 000000000..f2372d60d
--- /dev/null
+++ b/test/language/yiddish-transforms.test.js
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2023-2024 Yomitan Authors
+ * Copyright (C) 2020-2022 Yomichan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {LanguageTransformer} from '../../ext/js/language/language-transformer.js';
+import {yiddishTransforms} from '../../ext/js/language/yi/yiddish-transforms.js';
+import {testLanguageTransformer} from '../fixtures/language-transformer-test.js';
+
+ /* Yiddish final letters are applied by a text postprocessor that runs after all the transformations, so the `term` of a test case must never use the final form of a letter;
+ otherwise the case fails even if the rule is correct. A `source` must still use final letters, however, because the plural deinflection rules match on them. */
+ const tests = [
+ {
+ category: 'nouns',
+ valid: true,
+ tests: [
+ {term: 'גרופּע', source: 'גרופּעס', rule: 'ns', reasons: ['plural']}, // grupes -> grupe
+ {term: 'טיש', source: 'טישן', rule: 'ns', reasons: ['plural']}, // tishn -> tish
+ {term: 'פּויער', source: 'פּויערים', rule: 'ns', reasons: ['plural']}, // poyerim -> poyer
+ {term: 'קינד', source: 'קינדער', rule: 'ns', reasons: ['plural']}, // kinder -> kind
+ {term: 'בענקל', source: 'בענקלעך', rule: 'ns', reasons: ['plural']}, // benklekh -> benkl
+ {term: 'באַנ', source: 'באַנען', rule: 'ns', reasons: ['plural']}, // banen -> ban
+ {term: 'נודניק', source: 'נודניקעס', rule: 'ns', reasons: ['plural']}, // nudnikes -> nudnik
+ {term: 'חלומ', source: 'חלומות', rule: 'ns', reasons: ['plural']}, // khlomos -> khlom
+ {term: 'עטיקעט', source: 'עטיקעטקע', rule: 'n', reasons: ['diminutive']}, // etiketke -> etiket
+ {term: 'קליענטעל', source: 'קליענטעלטשיק', rule: 'n', reasons: ['diminutive']}, // klienteltshik -> klientel
+ {term: 'קינדער', source: 'קינדערלעך', rule: 'ns', reasons: ['diminutive', 'plural']}, // kinderlekh -> kinder
+ {term: 'ליפ', source: 'ליפענעס', rule: 'ns', reasons: ['plural']}, // lipenes -> lip
+ {term: 'אײ', source: 'אײערען', rule: 'ns', reasons: ['plural']}, // eyeren -> ey
+ {term: 'זאָק', source: 'זאָקענס', rule: 'ns', reasons: ['plural']}, // zokens -> zok
+ {term: 'בוך', source: 'בוךערס', rule: 'ns', reasons: ['plural']}, // bukhers -> bukh
+ {term: 'קוכן', source: 'קוכןסער', rule: 'ns', reasons: ['plural']}, // kukhnser -> kukhn
+ {term: 'קעניג', source: 'קעניגנס', rule: 'ns', reasons: ['plural']}, // kenigns -> kenig
+ ],
+ },
+ {
+ category: 'umlaut_nouns',
+ valid: true,
+ tests: [
+ {term: 'מאנ', source: 'מענער', rule: 'ns', reasons: ['umlaut_plural']}, // mener -> man
+ {term: 'טשוואק', source: 'טשוועקעס', rule: 'ns', reasons: ['umlaut_plural']}, // tshvekes -> tshvak
+ {term: 'מױד', source: 'מײדלעך', rule: 'ns', reasons: ['diminutive_and_umlaut', 'plural']}, // meydlekh -> moyd
+ {term: 'דאָקטער', source: 'דאָקטױרים', rule: 'ns', reasons: ['umlaut_plural']}, // doktoyrim -> dokter
+ {term: 'בלומ', source: 'בלימען', rule: 'ns', reasons: ['umlaut_plural']}, // blimen -> blum
+ {term: 'אומשטאנד', source: 'אומשטענדן', rule: 'ns', reasons: ['umlaut_plural']}, // umshtendn -> umshtand
+ {term: 'קאצ', source: 'קעצעלע', rule: 'n', reasons: ['diminutive_and_umlaut']}, // ketzele -> katz
+ {term: 'קאצ', source: 'קעצל', rule: 'n', reasons: ['diminutive_and_umlaut']}, // ketzl -> katz
+ {term: 'באַרג', source: 'בערגן', rule: 'ns', reasons: ['umlaut_plural']}, // bergn -> barg
+ {term: 'בױמ', source: 'בײמערס', rule: 'ns', reasons: ['umlaut_plural']}, // beymers -> boym
+ {term: 'קאפּ', source: 'קעפּער', rule: 'ns', reasons: ['umlaut_plural']}, // keper -> kop
+ {term: 'קאפּ', source: 'קעפּער', rule: 'ns', reasons: ['umlaut_plural']}, // keper -> kop; NOTE(review): same case as the previous line — an intended "kep" source seems to be missing
+ ],
+ },
+ {
+ category: 'verbs',
+ valid: true,
+ tests: [
+ {term: 'קויפֿ', source: 'קויפֿסט', rule: 'v', reasons: ['verb_present_singular_to_first_person']}, // koyfst -> koyf
+ {term: 'קויפֿ', source: 'קויפֿט', rule: 'vpresent', reasons: ['verb_present_singular_to_first_person']}, // koyft -> koyf
+ {term: 'קויפֿנ', source: 'קויפֿט', rule: 'vpresent', reasons: ['verb_present_plural_to_first_person']}, // koyft -> koyfn
+ {term: 'קויפֿנ', source: 'קויפֿטס', rule: 'vpresent', reasons: ['verb_present_plural_to_first_person']}, // koyfts -> koyfn
+ {term: 'קויפֿנ', source: 'קויפֿטס', rule: 'vpresent', reasons: ['verb_present_plural_to_first_person']}, // koyfts -> koyfn; NOTE(review): same case as the previous line
+ ],
+ },
+ ];
+
+
+ const languageTransformer = new LanguageTransformer(); // fresh transformer holding only the Yiddish descriptor
+ languageTransformer.addDescriptor(yiddishTransforms);
+ testLanguageTransformer(languageTransformer, tests); // hands every case above to the shared fixture from ../fixtures/language-transformer-test.js
diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts
index f48e1e6d5..c5e09ae42 100644
--- a/types/ext/language-descriptors.d.ts
+++ b/types/ext/language-descriptors.d.ts
@@ -204,6 +204,16 @@ type AllTextProcessors = {
normalizeDiacritics: TextProcessor<'old' | 'new' | 'off'>;
};
};
+ yi: { // Yiddish: keys mirror the textPreprocessors / textPostprocessors of the 'yi' language descriptor
+ pre: {
+ combineYiddishLigatures: TextProcessor;
+ removeYiddishDiacritics: TextProcessor;
+ };
+ post: {
+ convertFinalLetters: TextProcessor;
+ convertYiddishLigatures: BidirectionalConversionPreprocessor;
+ };
+ };
yue: {
pre: {
normalizeRadicalCharacters: TextProcessor;