forked from FooSoft/yomichan
-
Notifications
You must be signed in to change notification settings - Fork 116
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Yiddish support to Yomitan (#1567)
* [yi] Initial commit * [yi] Add umlaut demutation for diminutives and plurals * [yi] Add verb mutation code * [yi] Add missing plural forms, separate diminutive from diminutive with umlaut * [yi] Modify umlaut demutation to allow demutation of ayin to pasekh alef and komets alef, and demutation of vov yud to ayin * [yi] Add missing plural forms from Israeli-Haredi Yiddish --------- Signed-off-by: ThatsItForTheOtherOne <[email protected]>
- Loading branch information
1 parent
8f4cf51
commit 9b281df
Showing
6 changed files
with
399 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
/* | ||
* Copyright (C) 2024 Yomitan Authors | ||
* | ||
* This program is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation, either version 3 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program. If not, see <https://www.gnu.org/licenses/>. | ||
*/ | ||
|
||
|
||
/**
 * Lookup table from a regular Hebrew-script letter to its word-final shape.
 * Keys are the regular forms, values the corresponding final forms.
 * @type {Map<string, string>}
 */
const final_letter_map = new Map([
    ['\u05de', '\u05dd'], // מ to ם
    ['\u05e0', '\u05df'], // נ to ן
    ['\u05e6', '\u05e5'], // צ to ץ
    ['\u05e4', '\u05e3'], // פ to ף
    ['\u05db', '\u05da'], // כ to ך
]);
|
||
/**
 * Yiddish ligatures and precomposed letters paired with their decomposed spelling.
 * `lig` is the single code point; `split` is the equivalent sequence of base
 * letters (and combining vowel point, where there is one).
 */
const ligatures = [
    {lig: '\u05f0', split: '\u05d5\u05d5'}, // װ -> וו (tsvey vovn)
    {lig: '\u05f1', split: '\u05d5\u05d9'}, // ױ -> וי (vov yud)
    {lig: '\u05f2', split: '\u05d9\u05d9'}, // ײ -> יי (tsvey yudn)
    {lig: '\ufb1d', split: '\u05d9\u05b4'}, // precomposed khirik yud -> yud + khirik
    {lig: '\ufb1f', split: '\u05d9\u05d9\u05b7'}, // precomposed pasekh tsvey yudn -> yud + yud + pasekh
    {lig: '\ufb2e', split: '\u05d0\u05b7'}, // Pasekh alef
    {lig: '\ufb2f', split: '\u05d0\u05b8'}, // Komets alef
];
|
||
/**
 * Text processor that replaces the last character of the text with its
 * word-final form when `final_letter_map` has an entry for it. Only the very
 * last character of the whole string is examined; everything else, including
 * an empty string, is returned unchanged.
 * @type {import('language').TextProcessor<boolean>}
 */
export const convertFinalLetters = {
    name: 'Convert to Final Letters',
    description: 'קויף → קויפֿ',
    options: [true],
    process: (str) => {
        const lastIndex = str.length - 1;
        // charAt() yields '' for an empty string, which is never a key in the map.
        const finalForm = final_letter_map.get(str.charAt(lastIndex));
        if (typeof finalForm !== 'string') { return str; }
        return str.substring(0, lastIndex) + finalForm;
    },
};
|
||
/**
 * Bidirectional text processor for Yiddish ligatures.
 *
 * - `'off'`: the text is returned unchanged.
 * - `'direct'`: every ligature code point is replaced by its split spelling.
 * - `'inverse'`: every split spelling is replaced by its ligature code point.
 *
 * All occurrences are converted, not just the first one in the text. Entries
 * are applied in the order they appear in `ligatures`.
 * NOTE(review): the description string shows the same example as the
 * "Combine Ligatures" processor; confirm it reads in the intended direction.
 * @type {import('language').BidirectionalConversionPreprocessor}
 */
export const convertYiddishLigatures = {
    name: 'Split Ligatures',
    description: 'וו → װ',
    options: ['off', 'direct', 'inverse'],
    process: (str, setting) => {
        switch (setting) {
            case 'direct':
                return ligatures.reduce((text, {lig, split}) => text.replaceAll(lig, split), str);
            case 'inverse':
                return ligatures.reduce((text, {lig, split}) => text.replaceAll(split, lig), str);
            case 'off':
            default:
                // An unrecognised setting is treated like 'off' instead of yielding undefined.
                return str;
        }
    },
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
/* | ||
* Copyright (C) 2024 Yomitan Authors | ||
* | ||
* This program is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation, either version 3 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program. If not, see <https://www.gnu.org/licenses/>. | ||
*/ | ||
|
||
/**
 * Yiddish ligatures and precomposed letters paired with their decomposed spelling.
 * `lig` is the single code point; `split` is the equivalent sequence of base
 * letters (and combining vowel point, where there is one).
 */
const ligatures = [
    {lig: '\u05f0', split: '\u05d5\u05d5'}, // װ -> וו (tsvey vovn)
    {lig: '\u05f1', split: '\u05d5\u05d9'}, // ױ -> וי (vov yud)
    {lig: '\u05f2', split: '\u05d9\u05d9'}, // ײ -> יי (tsvey yudn)
    {lig: '\ufb1d', split: '\u05d9\u05b4'}, // precomposed khirik yud -> yud + khirik
    {lig: '\ufb1f', split: '\u05d9\u05d9\u05b7'}, // precomposed pasekh tsvey yudn -> yud + yud + pasekh
    {lig: '\ufb2e', split: '\u05d0\u05b7'}, // Pasekh alef
    {lig: '\ufb2f', split: '\u05d0\u05b8'}, // Komets alef
];
|
||
/**
 * Text processor that replaces every split spelling listed in `ligatures`
 * with its single ligature code point. All occurrences in the text are
 * combined, not just the first. Entries are applied in table order, so a
 * shorter split listed earlier is consumed before a longer one that starts
 * with the same letters.
 * @type {import('language').TextProcessor<boolean>}
 */
export const combineYiddishLigatures = {
    name: 'Combine Ligatures',
    description: 'וו → װ',
    options: [true],
    process: (str) => ligatures.reduce((text, {lig, split}) => text.replaceAll(split, lig), str),
};
|
||
/**
 * Text processor that deletes every code point in the range U+05B0..U+05C7
 * from the text, leaving the base letters in place.
 * NOTE(review): this range holds the Hebrew vowel points but also a few
 * punctuation code points (e.g. U+05BE); confirm that stripping those is intended.
 * @type {import('language').TextProcessor<boolean>}
 */
export const removeYiddishDiacritics = {
    name: 'Remove Diacritics',
    description: 'פאת → פֿאָתּ',
    options: [true],
    process: (str) => str.replace(/[\u05B0-\u05C7]/g, ''),
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
/* | ||
* Copyright (C) 2024 Yomitan Authors | ||
* | ||
* This program is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation, either version 3 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program. If not, see <https://www.gnu.org/licenses/>. | ||
*/ | ||
|
||
import {suffixInflection} from '../language-transforms.js'; | ||
|
||
/** @typedef {keyof typeof conditions} Condition */ | ||
|
||
/**
 * Vowel substitutions tried when undoing stem umlaut.
 * `new` is the vowel as it appears in the inflected (mutated) stem and `orig`
 * is the vowel written back into the deinflected stem.
 */
const mutations = [
    {new: '\u05e2', orig: '\ufb2e'}, // Ayin to pasekh alef
    {new: '\u05e2', orig: '\ufb2f'}, // Ayin to komets alef
    {new: '\u05e2', orig: '\u05d0'}, // Ayin to shumter alef
    {new: '\u05f1', orig: '\u05e2'}, // Vov yud to ayin
    {new: '\u05f2', orig: '\u05f1'}, // Tsvey yudn to Vov yud
    {new: '\u05d9', orig: '\u05d5'}, // Yud to Vov
];
|
||
/**
 * Builds one suffix rule per entry of `mutations`. Each rule strips
 * `inflectedSuffix` from the end of the text, looks at the last vowel-like
 * letter of the remaining stem and, if it equals the entry's `new` vowel,
 * swaps it for the entry's `orig` vowel before appending `deinflectedSuffix`.
 * A rule whose vowel does not match returns an empty string.
 * @param {string} inflectedSuffix Suffix carried by the inflected form; matched literally.
 * @param {string} deinflectedSuffix Suffix appended to the restored stem.
 * @param {Condition[]} conditionsIn
 * @param {Condition[]} conditionsOut
 * @returns {import('language-transformer').SuffixRule<Condition>[]}
 */
function umlautMutationSuffixInflection(inflectedSuffix, deinflectedSuffix, conditionsIn, conditionsOut) {
    // Escape regex metacharacters so the suffix can never be read as a pattern.
    const escapedSuffix = inflectedSuffix.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const suffixRegExp = new RegExp(escapedSuffix + '$');
    // Matches the last letter of the stem that belongs to the vowel set: a
    // member of the set that is not followed, later on, by another member.
    // Built once here rather than on every deinflect() call.
    const lastVowelRegExp = /[\u05E2\u05F0\u05D0\uFB2E\u05F1\u05D5\u05F2\uFB1D\uFB1F\u05D9\uFB2F](?!.*[\u05E2\u05F0\u05D0\uFB2E\u05F1\u05D5\u05F2\uFB1D\uFB1F\u05D9\uFB2F])/;
    return mutations.map((mutation) => (
        {
            type: 'suffix',
            isInflected: suffixRegExp,
            deinflected: deinflectedSuffix,
            deinflect: (/** @type {string} */ text) => {
                // An explicit end index is used because slice(0, -0) would return ''
                // for an empty suffix instead of the whole text.
                const stem = text.slice(0, Math.max(0, text.length - inflectedSuffix.length));
                const match = lastVowelRegExp.exec(stem);
                if (match === null || match[0] !== mutation.new) { return ''; }
                return stem.slice(0, match.index) + mutation.orig + stem.slice(match.index + 1) + deinflectedSuffix;
            },
            conditionsIn,
            conditionsOut,
        }
    ));
}
|
||
/**
 * Grammatical conditions used by the Yiddish transform rules. Each entry has
 * a display name and an `isDictionaryForm` flag; `v` and `n` additionally
 * list the narrower conditions they group under `subConditions`.
 */
const conditions = {
    v: {
        name: 'Verb',
        isDictionaryForm: true,
        subConditions: ['vpast', 'vpresent'],
    },
    vpast: {
        name: 'Verb, past tense',
        isDictionaryForm: false,
    },
    vpresent: {
        name: 'Verb, present tense',
        isDictionaryForm: true,
    },
    n: {
        name: 'Noun',
        isDictionaryForm: true,
        subConditions: ['np', 'ns'],
    },
    np: {
        name: 'Noun, plural',
        isDictionaryForm: false,
    },
    ns: {
        name: 'Noun, singular',
        isDictionaryForm: true,
    },
    adj: {
        name: 'Adjective',
        isDictionaryForm: true,
    },
    adv: {
        name: 'Adverb',
        isDictionaryForm: true,
    },
};
|
||
/**
 * Deinflection rules for Yiddish (`yi`). Each transform groups suffix rules
 * that strip an inflectional ending; the `umlaut_*` and
 * `diminutive_and_umlaut` groups additionally restore a mutated stem vowel
 * through `umlautMutationSuffixInflection`.
 * @type {import('language-transformer').LanguageTransformDescriptor<Condition>}
 */
export const yiddishTransforms = {
    language: 'yi',
    conditions,
    transforms: {
        // Plain plural endings: the ending is removed and nothing else changes.
        plural: {
            name: 'plural',
            description: 'plural form of a noun',
            rules: [
                suffixInflection('\u05E1', '', ['np'], ['ns']), // -s
                suffixInflection('\u05DF', '', ['np'], ['ns']), // -n
                suffixInflection('\u05D9\u05DD', '', ['np'], ['ns']), // -im, hebrew
                suffixInflection('\u05E2\u05E8', '', ['np'], ['ns']), // -er
                suffixInflection('\u05E2\u05DA', '', ['np'], ['ns']), // -ekh
                suffixInflection('\u05E2\u05DF', '', ['np'], ['ns']), // -en
                suffixInflection('\u05E2\u05E1', '', ['np'], ['ns']), // -es
                suffixInflection('\u05D5\u05EA', '', ['np'], ['ns']), // -ot, hebrew
                suffixInflection('\u05E0\u05E1', '', ['np'], ['ns']), // -ns
                suffixInflection('\u05E2\u05E8\u05E2\u05DF', '', ['np'], ['ns']), // -eren
                suffixInflection('\u05E2\u05E0\u05E2\u05E1', '', ['np'], ['ns']), // -enes
                suffixInflection('\u05E2\u05E0\u05E1', '', ['np'], ['ns']), // -ens
                suffixInflection('\u05E2\u05E8\u05E1', '', ['np'], ['ns']), // -ers
                suffixInflection('\u05E1\u05E2\u05E8', '', ['np'], ['ns']), // -ser
            ],
        },
        // Plural endings combined with a stem vowel change that must be undone.
        umlaut_plural: {
            name: 'umlaut_plural',
            description: 'plural form of a umlaut noun',
            rules: [
                ...umlautMutationSuffixInflection('\u05E2\u05E8', '', ['np'], ['ns']), // -er
                ...umlautMutationSuffixInflection('\u05E2\u05E1', '', ['np'], ['ns']), // -es
                ...umlautMutationSuffixInflection('\u05D9\u05DD', '', ['np'], ['ns']), // -im
                ...umlautMutationSuffixInflection('\u05E2\u05DF', '', ['np'], ['ns']), // -en
                ...umlautMutationSuffixInflection('\u05DF', '', ['np'], ['ns']), // -n
                ...umlautMutationSuffixInflection('\u05E1', '', ['np'], ['ns']), // -s
                ...umlautMutationSuffixInflection('\u05E2\u05DA', '', ['np'], ['ns']), // -ekh
                ...umlautMutationSuffixInflection('\u05E2\u05E8\u05E1', '', ['np'], ['ns']), // -ers
            ],
        },
        // Diminutive endings without a stem vowel change.
        diminutive: {
            name: 'diminutive',
            description: 'diminutive form of a noun',
            rules: [
                suffixInflection('\u05D8\u05E9\u05D9\u05E7', '', ['n'], ['n']), // -tshik
                suffixInflection('\u05E7\u05E2', '', ['n'], ['n']), // -ke
                suffixInflection('\u05DC', '', ['n'], ['n']), // -l
                suffixInflection('\u05E2\u05DC\u05E2', '', ['n'], ['n']), // -ele
            ],
        },
        // Diminutive endings combined with a stem vowel change.
        diminutive_and_umlaut: {
            name: 'diminutive_and_umlaut',
            description: 'diminutive form of a noun with stem umlaut',
            rules: [
                ...umlautMutationSuffixInflection('\u05DC', '', ['n'], ['n']), // -l
                ...umlautMutationSuffixInflection('\u05E2\u05DC\u05E2', '', ['n'], ['n']), // -ele
            ],
        },
        // Present-tense endings that are simply removed.
        verb_present_singular_to_first_person: {
            name: 'verb_present_singular_to_first_person',
            description: 'Turn the second and third person singular form to first person',
            rules: [
                suffixInflection('\u05E1\u05D8', '', ['v'], ['vpresent']), // -st
                suffixInflection('\u05D8', '', ['v'], ['vpresent']), // -t
                suffixInflection('\u05E0\u05D3\u05D9\u05E7', '', ['v'], ['vpresent']), // -ndik
            ],
        },
        // Present-tense endings that are replaced by the regular (non-final) nun U+05E0.
        verb_present_plural_to_first_person: {
            name: 'verb_present_plural_to_first_person',
            description: 'Turn the second plural form to first person plural form',
            rules: [
                suffixInflection('\u05D8\u05E1', '\u05E0', ['v'], ['vpresent']), // -ts
                suffixInflection('\u05D8', '\u05E0', ['v'], ['vpresent']), // -t
            ],
        },
    },
};
Oops, something went wrong.