Skip to content

Commit

Permalink
Add Yiddish support to Yomitan (#1567)
Browse files Browse the repository at this point in the history
* [yi] Initial commit

* [yi] Add umlaut demutation for diminutives and plurals

* [yi] Add verb mutation code

* [yi] Add missing plural forms, separate dimimutive from diminutive with umlaut

* [yi] Modify umlaut demutation to allow demutation of ayin to pasekh alef and komets alef, and demutation of vov yud to ayin

* [yi] Add missing plural forms from Israeli-Haredi yiddish

---------

Signed-off-by: ThatsItForTheOtherOne <[email protected]>
  • Loading branch information
ThatsItForTheOtherOne authored Dec 31, 2024
1 parent 8f4cf51 commit 9b281df
Show file tree
Hide file tree
Showing 6 changed files with 399 additions and 0 deletions.
18 changes: 18 additions & 0 deletions ext/js/language/language-descriptors.js
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ import {albanianTransforms} from './sq/albanian-transforms.js';
import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
import {tagalogTransforms} from './tl/tagalog-transforms.js';
import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
import {convertFinalLetters, convertYiddishLigatures} from './yi/yiddish-text-postprocessors.js';
import {combineYiddishLigatures, removeYiddishDiacritics} from './yi/yiddish-text-preprocessors.js';
import {yiddishTransforms} from './yi/yiddish-transforms.js';
import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js';

const capitalizationPreprocessors = {
Expand Down Expand Up @@ -381,6 +384,21 @@ const languageDescriptors = [
normalizeDiacritics,
},
},
{
iso: 'yi',
iso639_3: 'yid',
name: 'Yiddish',
exampleText: 'באַשאַפֿן',
textPreprocessors: {
removeYiddishDiacritics,
combineYiddishLigatures,
},
textPostprocessors: {
convertFinalLetters,
convertYiddishLigatures,
},
languageTransforms: yiddishTransforms,
},
{
iso: 'yue',
iso639_3: 'yue',
Expand Down
72 changes: 72 additions & 0 deletions ext/js/language/yi/yiddish-text-postprocessors.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Copyright (C) 2024 Yomitan Authors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/


const final_letter_map = new Map([
['\u05de', '\u05dd'], // מ to ם
['\u05e0', '\u05df'], // נ to ן
['\u05e6', '\u05e5'], // צ to ץ
['\u05e4', '\u05e3'], // פ to ף
['\u05dB', '\u05da'], // כ to ך
]);

const ligatures = [
{lig: '\u05f0', split: '\u05d5' + '\u05d5'}, // װ -> וו
{lig: '\u05f1', split: '\u05d5' + '\u05d9'}, // ױ -> וי
{lig: '\u05f2', split: '\u05d9' + '\u05d9'}, // ײ -> יי
{lig: '\ufb1d', split: '\u05d9' + '\u05b4'}, // יִ -> יִ
{lig: '\ufb1f', split: '\u05d9' + '\u05d9' + '\u05b7'}, // ײַ -> ייַ
{lig: '\ufb2e', split: '\u05d0' + '\u05b7'}, // Pasekh alef
{lig: '\ufb2f', split: '\u05d0' + '\u05b8'}, // Komets alef
];

/** @type {import('language').TextProcessor<boolean>} */
export const convertFinalLetters = {
name: 'Convert to Final Letters',
description: 'קויף → קויפֿ',
options: [true],
process: (str) => {
const len = str.length - 1;
if ([...final_letter_map.keys()].includes(str.charAt(len))) {
str = str.substring(0, len) + final_letter_map.get(str.substring(len));
}
return str;
},
};

/** @type {import('language').BidirectionalConversionPreprocessor} */
export const convertYiddishLigatures = {
name: 'Split Ligatures',
description: 'וו → װ',
options: ['off', 'direct', 'inverse'],
process: (str, setting) => {
switch (setting) {
case 'off':
return str;
case 'direct':
for (const ligature of ligatures) {
str = str.replace(ligature.lig, ligature.split);
}
return str;
case 'inverse':
for (const ligature of ligatures) {
str = str.replace(ligature.split, ligature.lig);
}
return str;
}
},
};
49 changes: 49 additions & 0 deletions ext/js/language/yi/yiddish-text-preprocessors.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Copyright (C) 2024 Yomitan Authors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

const ligatures = [
{lig: '\u05f0', split: '\u05d5' + '\u05d5'}, // װ -> וו
{lig: '\u05f1', split: '\u05d5' + '\u05d9'}, // ױ -> וי
{lig: '\u05f2', split: '\u05d9' + '\u05d9'}, // ײ -> יי
{lig: '\ufb1d', split: '\u05d9' + '\u05b4'}, // יִ -> יִ
{lig: '\ufb1f', split: '\u05d9' + '\u05d9' + '\u05b7'}, // ײַ -> ייַ
{lig: '\ufb2e', split: '\u05d0' + '\u05b7'}, // Pasekh alef
{lig: '\ufb2f', split: '\u05d0' + '\u05b8'}, // Komets alef
];

/** @type {import('language').TextProcessor<boolean>} */
export const combineYiddishLigatures = {
name: 'Combine Ligatures',
description: 'וו → װ',
options: [true],
process: (str) => {
for (const ligature of ligatures) {
str = str.replace(ligature.split, ligature.lig);
}
return str;
},
};

/** @type {import('language').TextProcessor<boolean>} */
export const removeYiddishDiacritics = {
name: 'Remove Diacritics',
description: 'פאת → פֿאָתּ',
options: [true],
process: (str) => {
return str.replace(/[\u05B0-\u05C7]/g, '');
},
};
167 changes: 167 additions & 0 deletions ext/js/language/yi/yiddish-transforms.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
/*
* Copyright (C) 2024 Yomitan Authors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

import {suffixInflection} from '../language-transforms.js';

/** @typedef {keyof typeof conditions} Condition */

const mutations = [
{new: '\u05e2', orig: '\ufb2e'}, // Ayin to pasekh alef
{new: '\u05e2', orig: '\ufb2f'}, // Ayin to komets alef
{new: '\u05e2', orig: '\u05D0'}, // Ayin to shumter alef
{new: '\u05f1', orig: '\u05e2'}, // Vov yud to ayin
{new: '\u05f2', orig: '\u05f1'}, // Tsvey yudn to Vov yud
{new: '\u05d9', orig: '\u05d5'}, // Yud to Vov
];

/**
* @param {string} inflectedSuffix
* @param {string} deinflectedSuffix
* @param {Condition[]} conditionsIn
* @param {Condition[]} conditionsOut
* @returns {import('language-transformer').SuffixRule<Condition>[]}
*/
function umlautMutationSuffixInflection(inflectedSuffix, deinflectedSuffix, conditionsIn, conditionsOut) {
const suffixRegExp = new RegExp(inflectedSuffix + '$');
return mutations.map((mutation) => (
{
type: 'suffix',
isInflected: suffixRegExp,
deinflected: deinflectedSuffix,
deinflect: (/** @type {string} */ text) => {
const match = new RegExp(/[\u05E2\u05F0\u05D0\uFB2E\u05F1\u05D5\u05F2\uFB1D\uFB1F\u05D9\uFB2F](?!.*[\u05E2\u05F0\u05D0\uFB2E\u05F1\u05D5\u05F2\uFB1D\uFB1F\u05D9\uFB2F])/).exec(text.slice(0, -inflectedSuffix.length));
return (match?.[0] !== mutation.new) ? '' : text.slice(0, match.index) + mutation.orig + text.slice(match.index + 1, -inflectedSuffix.length) + deinflectedSuffix;
},
conditionsIn,
conditionsOut,
}
));
}

const conditions = {
v: {
name: 'Verb',
isDictionaryForm: true,
subConditions: ['vpast', 'vpresent'],
},
vpast: {
name: 'Verb, past tense',
isDictionaryForm: false,
},
vpresent: {
name: 'Verb, present tense',
isDictionaryForm: true,
},
n: {
name: 'Noun',
isDictionaryForm: true,
subConditions: ['np', 'ns'],
},
np: {
name: 'Noun, plural',
isDictionaryForm: false,
},
ns: {
name: 'Noun, singular',
isDictionaryForm: true,
},
adj: {
name: 'Adjective',
isDictionaryForm: true,
},
adv: {
name: 'Adverb',
isDictionaryForm: true,
},
};

/** @type {import('language-transformer').LanguageTransformDescriptor<Condition>} */
export const yiddishTransforms = {
language: 'yi',
conditions,
transforms: {
plural: {
name: 'plural',
description: 'plural form of a noun',
rules: [
suffixInflection('\u05E1', '', ['np'], ['ns']), // -s
suffixInflection('\u05DF', '', ['np'], ['ns']), // -n
suffixInflection('\u05D9\u05DD', '', ['np'], ['ns']), // -im, hebrew
suffixInflection('\u05E2\u05E8', '', ['np'], ['ns']), // -er
suffixInflection('\u05E2\u05DA', '', ['np'], ['ns']), // -ekh
suffixInflection('\u05E2\u05DF', '', ['np'], ['ns']), // -en
suffixInflection('\u05E2\u05E1', '', ['np'], ['ns']), // -es
suffixInflection('\u05D5\u05EA', '', ['np'], ['ns']), // -ot, hebrew
suffixInflection('\u05E0\u05E1', '', ['np'], ['ns']), // -ns
suffixInflection('\u05E2\u05E8\u05E2\u05DF', '', ['np'], ['ns']), // -eren
suffixInflection('\u05E2\u05E0\u05E2\u05E1', '', ['np'], ['ns']), // -enes
suffixInflection('\u05E2\u05E0\u05E1', '', ['np'], ['ns']), // -ens
suffixInflection('\u05E2\u05E8\u05E1', '', ['np'], ['ns']), // -ers
suffixInflection('\u05E1\u05E2\u05E8', '', ['np'], ['ns']), // -ser
],
},
umlaut_plural: {
name: 'umlaut_plural',
description: 'plural form of a umlaut noun',
rules: [
...umlautMutationSuffixInflection('\u05E2\u05E8', '', ['np'], ['ns']), // -er
...umlautMutationSuffixInflection('\u05E2\u05E1', '', ['np'], ['ns']), // -es
...umlautMutationSuffixInflection('\u05D9\u05DD', '', ['np'], ['ns']), // -im
...umlautMutationSuffixInflection('\u05E2\u05DF', '', ['np'], ['ns']), // -en
...umlautMutationSuffixInflection('\u05DF', '', ['np'], ['ns']), // -n
...umlautMutationSuffixInflection('\u05E1', '', ['np'], ['ns']), // -s
...umlautMutationSuffixInflection('\u05E2\u05DA', '', ['np'], ['ns']), // -ekh
...umlautMutationSuffixInflection('\u05E2\u05E8\u05E1', '', ['np'], ['ns']), // -ers
],
},
diminutive: {
name: 'diminutive',
description: 'diminutive form of a noun',
rules: [
suffixInflection('\u05D8\u05E9\u05D9\u05E7', '', ['n'], ['n']), // -tshik
suffixInflection('\u05E7\u05E2', '', ['n'], ['n']), // -ke
suffixInflection('\u05DC', '', ['n'], ['n']), // -l
suffixInflection('\u05E2\u05DC\u05E2', '', ['n'], ['n']), // -ele
],
},
diminutive_and_umlaut: {
name: 'diminutive_and_umlaut',
description: 'diminutive form of a noun with stem umlaut',
rules: [
...umlautMutationSuffixInflection('\u05DC', '', ['n'], ['n']), // -l
...umlautMutationSuffixInflection('\u05E2\u05DC\u05E2', '', ['n'], ['n']), // -ele
],
},
verb_present_singular_to_first_person: {
name: 'verb_present_singular_to_first_person',
description: 'Turn the second and third person singular form to first person',
rules: [
suffixInflection('\u05E1\u05D8', '', ['v'], ['vpresent']), // -st
suffixInflection('\u05D8', '', ['v'], ['vpresent']), // -t
suffixInflection('\u05E0\u05D3\u05D9\u05E7', '', ['v'], ['vpresent']), // -ndik
],
},
verb_present_plural_to_first_person: {
name: 'verb_present_plural_to_first_person',
description: 'Turn the second plural form to first person plural form',
rules: [
suffixInflection('\u05D8\u05E1', '\u05E0', ['v'], ['vpresent']), // -ts
suffixInflection('\u05D8', '\u05E0', ['v'], ['vpresent']), // -t
],
},
},
};
Loading

0 comments on commit 9b281df

Please sign in to comment.