From fd2dfc10e51099bab0e5732712ee4e97f0a3bae8 Mon Sep 17 00:00:00 2001 From: Lyroxide Date: Mon, 25 Mar 2024 14:53:53 +0800 Subject: [PATCH 1/9] add hangul functions --- .eslintrc.json | 1 + ext/js/language/ko/korean-hangul.js | 586 +++++++++++++++++++ ext/js/language/ko/korean-text-processors.js | 6 +- 3 files changed, 591 insertions(+), 2 deletions(-) create mode 100644 ext/js/language/ko/korean-hangul.js diff --git a/.eslintrc.json b/.eslintrc.json index 4e7503ad71..6fdb8a14d6 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -647,6 +647,7 @@ "ext/js/language/ja/japanese-transforms.js", "ext/js/language/ja/japanese-wanakana.js", "ext/js/language/ja/japanese.js", + "ext/js/language/ko/korean-hangul.js", "ext/js/language/ko/korean-text-processors.js", "ext/js/language/la/latin-text-preprocessors.js", "ext/js/language/language-descriptors.js", diff --git a/ext/js/language/ko/korean-hangul.js b/ext/js/language/ko/korean-hangul.js new file mode 100644 index 0000000000..d9863af79a --- /dev/null +++ b/ext/js/language/ko/korean-hangul.js @@ -0,0 +1,586 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +const HANGUL_OFFSET = 0xAC00; + +const CHO = [ + 'ㄱ', + 'ㄲ', + 'ㄴ', + 'ㄷ', + 'ㄸ', + 'ㄹ', + 'ㅁ', + 'ㅂ', + 'ㅃ', + 'ㅅ', + 'ㅆ', + 'ㅇ', + 'ㅈ', + 'ㅉ', + 'ㅊ', + 'ㅋ', + 'ㅌ', + 'ㅍ', + 'ㅎ' +]; + +const JUNG = [ + 'ㅏ', + 'ㅐ', + 'ㅑ', + 'ㅒ', + 'ㅓ', + 'ㅔ', + 'ㅕ', + 'ㅖ', + 'ㅗ', + ['ㅗ', 'ㅏ'], + ['ㅗ', 'ㅐ'], + ['ㅗ', 'ㅣ'], + 'ㅛ', + 'ㅜ', + ['ㅜ', 'ㅓ'], + ['ㅜ', 'ㅔ'], + ['ㅜ', 'ㅣ'], + 'ㅠ', + 'ㅡ', + ['ㅡ', 'ㅣ'], + 'ㅣ' +]; + +const JONG = [ + '', + 'ㄱ', + 'ㄲ', + ['ㄱ', 'ㅅ'], + 'ㄴ', + ['ㄴ', 'ㅈ'], + ['ㄴ', 'ㅎ'], + 'ㄷ', + 'ㄹ', + ['ㄹ', 'ㄱ'], + ['ㄹ', 'ㅁ'], + ['ㄹ', 'ㅂ'], + ['ㄹ', 'ㅅ'], + ['ㄹ', 'ㅌ'], + ['ㄹ', 'ㅍ'], + ['ㄹ', 'ㅎ'], + 'ㅁ', + 'ㅂ', + ['ㅂ', 'ㅅ'], + 'ㅅ', + 'ㅆ', + 'ㅇ', + 'ㅈ', + 'ㅊ', + 'ㅋ', + 'ㅌ', + 'ㅍ', + 'ㅎ' +]; + +const CONSONANTS = [ + 'ㄱ', + 'ㄲ', + 'ㄳ', + 'ㄴ', + 'ㄵ', + 'ㄶ', + 'ㄷ', + 'ㄸ', + 'ㄹ', + 'ㄺ', + 'ㄻ', + 'ㄼ', + 'ㄽ', + 'ㄾ', + 'ㄿ', + 'ㅀ', + 'ㅁ', + 'ㅂ', + 'ㅃ', + 'ㅄ', + 'ㅅ', + 'ㅆ', + 'ㅇ', + 'ㅈ', + 'ㅉ', + 'ㅊ', + 'ㅋ', + 'ㅌ', + 'ㅍ', + 'ㅎ' +]; + +const COMPLETE_CHO = [ + 'ㄱ', + 'ㄲ', + 'ㄴ', + 'ㄷ', + 'ㄸ', + 'ㄹ', + 'ㅁ', + 'ㅂ', + 'ㅃ', + 'ㅅ', + 'ㅆ', + 'ㅇ', + 'ㅈ', + 'ㅉ', + 'ㅊ', + 'ㅋ', + 'ㅌ', + 'ㅍ', + 'ㅎ' +]; + +const COMPLETE_JUNG = [ + 'ㅏ', + 'ㅐ', + 'ㅑ', + 'ㅒ', + 'ㅓ', + 'ㅔ', + 'ㅕ', + 'ㅖ', + 'ㅗ', + 'ㅘ', + 'ㅙ', + 'ㅚ', + 'ㅛ', + 'ㅜ', + 'ㅝ', + 'ㅞ', + 'ㅟ', + 'ㅠ', + 'ㅡ', + 'ㅢ', + 'ㅣ' +]; + +const COMPLETE_JONG = [ + '', + 'ㄱ', + 'ㄲ', + 'ㄳ', + 'ㄴ', + 'ㄵ', + 'ㄶ', + 'ㄷ', + 'ㄹ', + 'ㄺ', + 'ㄻ', + 'ㄼ', + 'ㄽ', + 'ㄾ', + 'ㄿ', + 'ㅀ', + 'ㅁ', + 'ㅂ', + 'ㅄ', + 'ㅅ', + 'ㅆ', + 'ㅇ', + 'ㅈ', + 'ㅊ', + 'ㅋ', + 'ㅌ', + 'ㅍ', + 'ㅎ' +]; + +const COMPLEX_CONSONANTS = [ + ['ㄱ', 'ㅅ', 'ㄳ'], + ['ㄴ', 'ㅈ', 'ㄵ'], + ['ㄴ', 'ㅎ', 'ㄶ'], + ['ㄹ', 'ㄱ', 'ㄺ'], + ['ㄹ', 'ㅁ', 'ㄻ'], + ['ㄹ', 'ㅂ', 'ㄼ'], + ['ㄹ', 'ㅅ', 'ㄽ'], + ['ㄹ', 'ㅌ', 'ㄾ'], + ['ㄹ', 'ㅍ', 'ㄿ'], + ['ㄹ', 'ㅎ', 'ㅀ'], + ['ㅂ', 'ㅅ', 'ㅄ'] +]; + +const COMPLEX_VOWELS = [ + ['ㅗ', 'ㅏ', 'ㅘ'], + ['ㅗ', 'ㅐ', 'ㅙ'], + ['ㅗ', 'ㅣ', 'ㅚ'], + ['ㅜ', 'ㅓ', 'ㅝ'], + ['ㅜ', 'ㅔ', 'ㅞ'], + ['ㅜ', 'ㅣ', 'ㅟ'], + ['ㅡ', 'ㅣ', 'ㅢ'] +]; + +const makeHash = (array) => { + const hash = {0: 0}; + for (let i = 0; i < 
array.length; i++) { + if (array[i]) { + hash[array[i].charCodeAt(0)] = i; + } + } + return hash; +}; + +const CONSONANTS_HASH = makeHash(CONSONANTS); + +const CHO_HASH = makeHash(COMPLETE_CHO); + +const JUNG_HASH = makeHash(COMPLETE_JUNG); + +const JONG_HASH = makeHash(COMPLETE_JONG); + +const makeComplexHash = (array) => { + const hash = {}; + let code1, code2; + for (let i = 0; i < array.length; i++) { + code1 = array[i][0].charCodeAt(0); + code2 = array[i][1].charCodeAt(0); + if (typeof hash[code1] === 'undefined') { + hash[code1] = {}; + } + hash[code1][code2] = array[i][2].charCodeAt(0); + } + return hash; +}; + +const COMPLEX_CONSONANTS_HASH = makeComplexHash(COMPLEX_CONSONANTS); + +const COMPLEX_VOWELS_HASH = makeComplexHash(COMPLEX_VOWELS); + +/** + * Checks if the given character is a Korean consonant. + * @param {number} c The character to check. + * @returns {boolean} True if the character is a Korean consonant, false otherwise. + */ +function isConsonant(c) { + return typeof CONSONANTS_HASH[c] !== 'undefined'; +} + +/** + * Checks if the given character is a Korean initial consonant (cho). + * @param {number} c The character to check. + * @returns {boolean} True if the character is a Korean initial consonant, false otherwise. + */ +function isCho(c) { + return typeof CHO_HASH[c] !== 'undefined'; +} + +/** + * Checks if the given character is a Korean vowel (jung). + * @param {number} c The character to check. + * @returns {boolean} True if the character is a Korean vowel, false otherwise. + */ +function isJung(c) { + return typeof JUNG_HASH[c] !== 'undefined'; +} + +/** + * Checks if the given character is a Korean final consonant (jong). + * @param {number} c The character to check. + * @returns {boolean} True if the character is a Korean final consonant, false otherwise. + */ +function isJong(c) { + return typeof JONG_HASH[c] !== 'undefined'; +} + +/** + * Checks if the given character code represents a Hangul character. 
+ * @param {number} charCode The character code to check. + * @returns {boolean} True if the character code represents a Hangul character, false otherwise. + */ +function isHangul(charCode) { + return HANGUL_OFFSET <= charCode && charCode <= 0xd7a3; +} + +/** + * Retrieves the indices of the initial consonant (cho), vowel (jung), and final consonant (jong) + * that make up the given Hangul character code. + * @param {number} charCode The character code of the Hangul character. + * @returns {object} An object containing the indices of cho, jung, and jong. + */ +function getHangulIndices(charCode) { + const baseCode = charCode - HANGUL_OFFSET; + return { + cho: Math.floor(baseCode / 588), + jung: Math.floor((baseCode % 588) / 28), + jong: baseCode % 28 + }; +} + +/** + * Checks if the given characters 'a' and 'b' can be combined to form a complex vowel. + * @param {number} a The character code of the first vowel. + * @param {number} b The character code of the second vowel. + * @returns {(number|boolean)} The character code of the combined complex vowel, or false if they cannot be combined. + */ +function isJungJoinable(a, b) { + return (COMPLEX_VOWELS_HASH[a] && COMPLEX_VOWELS_HASH[a][b]) ? COMPLEX_VOWELS_HASH[a][b] : false; +} + +/** + * Checks if the given characters 'a' and 'b' can be combined to form a complex final consonant. + * @param {number} a The character code of the first final consonant. + * @param {number} b The character code of the second final consonant. + * @returns {(number|boolean)} The character code of the combined complex final consonant, or false if they cannot be combined. + */ +function isJongJoinable(a, b) { + return COMPLEX_CONSONANTS_HASH[a] && COMPLEX_CONSONANTS_HASH[a][b] ? COMPLEX_CONSONANTS_HASH[a][b] : false; +} + +/** + * Disassembles a given string into an array of individual Hangul characters or character components. + * @param {string} string The string to be disassembled. 
+ * @param {boolean} [grouped=false] Whether to group the components of each Hangul character. + * @returns {(string|string[])} An array of individual Hangul characters or character components, or a single string if `grouped` is false. + * @throws {Error} If the input string is null. + */ +export const disassemble = (string, grouped = false) => { + if (string === null) { + throw new Error('Arguments cannot be null'); + } + + string = Array.isArray(string) ? string.join('') : string; + + const result = []; + + for (const character of string) { + const charCode = character.charCodeAt(0); + + if (isHangul(charCode)) { + const {cho, jung, jong} = getHangulIndices(charCode); + + const disassembled = [CHO[cho]]; + + if (Array.isArray(JUNG[jung])) { + disassembled.push(JUNG[jung].join('')); // eslint-disable-line @typescript-eslint/no-unsafe-argument + } else { + disassembled.push(JUNG[jung]); + } + + if (JONG[jong]) { + if (Array.isArray(JONG[jong])) { + disassembled.push(JONG[jong].join('')); // eslint-disable-line @typescript-eslint/no-unsafe-argument + } else { + disassembled.push(JONG[jong]); + } + } + + if (grouped) { + result.push(disassembled); + } else { + result.push(...disassembled); + } + } else if (isConsonant(charCode)) { + if (isCho(charCode)) { + result.push(CHO[CHO_HASH[charCode]]); + } else { + if (Array.isArray(JONG[JONG_HASH[charCode]])) { + result.push(JONG[JONG_HASH[charCode]].join('')); + } else { + result.push(JONG[JONG_HASH[charCode]]); + } + } + } else if (isJung(charCode)) { + if (Array.isArray(JUNG[JUNG_HASH[charCode]])) { + result.push(JUNG[JUNG_HASH[charCode]].join('')); + } else { + result.push(JUNG[JUNG_HASH[charCode]]); + } + } else { + result.push(character); + } + } + return grouped ? result : result.join(''); +}; + +/** + * Assembles an array of individual Hangul characters or character components into a single string. + * @param {string} string The string containing individual Hangul characters or character components to be assembled. 
+ * @returns {string} The assembled string. + * @throws {Error} If the input string is null. + */ +export const assemble = (string) => { + if (string === null) { + throw new Error('Arguments cannot be null'); + } + + const array = [...disassemble(string)]; + + const result = []; + + let complete_index = -1, + jong_joined = false; + + /** + * Helper function to combine jamo into hangul + * @param {number} index Index of a hangul + */ + function makeHangul(index) { + let cho, + jung1, + jung2, + jong1 = 0, + jong2, + hangul = ''; + + jong_joined = false; + + if (complete_index + 1 > index) { + return; + } + + for (let step = 1; ; step++) { + // eslint-disable-next-line unicorn/prefer-switch + if (step === 1) { + cho = array[complete_index + step].charCodeAt(0); + if (isJung(cho)) { + if (complete_index + step + 1 <= index && isJung(jung1 = array[complete_index + step + 1].charCodeAt(0))) { + result.push(String.fromCharCode(isJungJoinable(cho, jung1))); + complete_index = index; + return; + } else { + result.push(array[complete_index + step]); + complete_index = index; + return; + } + } else if (!isCho(cho)) { + result.push(array[complete_index + step]); + complete_index = index; + return; + } + hangul = array[complete_index + step]; + } else if (step === 2) { + jung1 = array[complete_index + step].charCodeAt(0); + if (isCho(jung1)) { + result.push(String.fromCharCode(isJongJoinable(cho, jung1))); + complete_index = index; + return; + } else { + hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + HANGUL_OFFSET); + } + } else if (step === 3) { + jung2 = array[complete_index + step].charCodeAt(0); + if (isJungJoinable(jung1, jung2)) { + jung1 = isJungJoinable(jung1, jung2); + } else { + jong1 = jung2; + } + hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + JONG_HASH[jong1] + HANGUL_OFFSET); // eslint-disable-line @typescript-eslint/no-unsafe-argument + } else if (step === 4) { + jong2 = array[complete_index + 
step].charCodeAt(0); + jong1 = isJongJoinable(jong1, jong2) ?? jong2; + hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + JONG_HASH[jong1] + HANGUL_OFFSET); // eslint-disable-line @typescript-eslint/no-unsafe-argument + } else if (step === 5) { + jong2 = array[complete_index + step].charCodeAt(0); + jong1 = isJongJoinable(jong1, jong2); + hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + JONG_HASH[jong1] + HANGUL_OFFSET); // eslint-disable-line @typescript-eslint/no-unsafe-argument + } + if (complete_index + step >= index) { + result.push(hangul); + complete_index = index; + return; + } + } + } + + let index, + stage = 0, + previousCharCode; + + for (index = 0; index < array.length; index++) { + const charCode = array[index].charCodeAt(0); + + if (!isCho(charCode) && !isJung(charCode) && !isJong(charCode)) { + makeHangul(index - 1); + makeHangul(index); + stage = 0; + continue; + } + + // eslint-disable-next-line unicorn/prefer-switch + if (stage === 0) { + if (isCho(charCode)) { + stage = 1; + } else if (isJung(charCode)) { + stage = 4; + } + } else if (stage === 1) { + if (isJung(charCode)) { + stage = 2; + } else { + if (isJongJoinable(previousCharCode, charCode)) { + stage = 5; + } else { + makeHangul(index - 1); + } + } + } else if (stage === 2) { + if (isJong(charCode)) { + stage = 3; + } else if (isJung(charCode)) { + if (!isJungJoinable(previousCharCode, charCode)) { + makeHangul(index - 1); + stage = 4; + } + } else { + makeHangul(index - 1); + stage = 1; + } + } else if (stage === 3) { + if (isJong(charCode)) { + if (!jong_joined && isJongJoinable(previousCharCode, charCode)) { + jong_joined = true; + } else { + makeHangul(index - 1); + stage = 1; + } + } else if (isCho(charCode)) { + makeHangul(index - 1); + stage = 1; + } else if (isJung(charCode)) { + makeHangul(index - 2); + stage = 2; + } + } else if (stage === 4) { + if (isJung(charCode)) { + if (isJungJoinable(previousCharCode, charCode)) { + 
makeHangul(index); + stage = 0; + } else { + makeHangul(index - 1); + } + } else { + makeHangul(index - 1); + stage = 1; + } + } else if (stage === 5) { + if (isJung(charCode)) { + makeHangul(index - 2); + stage = 2; + } else { + makeHangul(index - 1); + stage = 1; + } + } + previousCharCode = charCode; + } + makeHangul(index - 1); + return result.join(''); +}; diff --git a/ext/js/language/ko/korean-text-processors.js b/ext/js/language/ko/korean-text-processors.js index 859ddc80ae..b0df4b413e 100644 --- a/ext/js/language/ko/korean-text-processors.js +++ b/ext/js/language/ko/korean-text-processors.js @@ -15,13 +15,15 @@ * along with this program. If not, see . */ +import {assemble, disassemble} from './korean-hangul.js'; + /** @type {import('language').TextProcessor} */ export const disassembleHangul = { name: 'Disassemble Hangul', description: 'Disassemble Hangul characters into jamo.', options: [true], // Could probably also be set to [false, true], but this way it is always on process: (str) => { - return str; // Import from hangul.js + return disassemble(str); } }; @@ -31,6 +33,6 @@ export const reassembleHangul = { description: 'Reassemble Hangul characters from jamo.', options: [true], // Could probably also be set to [false, true], but this way it is always on process: (str) => { - return str; // Import from hangul.js + return assemble(str); } }; From c36364d1a297046895c34c596dbdd5ca1a3a3d13 Mon Sep 17 00:00:00 2001 From: Lyroxide Date: Mon, 25 Mar 2024 19:56:04 +0800 Subject: [PATCH 2/9] hangul lib --- dev/lib/hangul-js.js | 18 +++++ ext/js/language/ko/korean-text-processors.js | 8 +- ext/js/language/ko/korean-transforms.js | 84 ++++++++++++++++++++ package.json | 1 + 4 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 dev/lib/hangul-js.js create mode 100644 ext/js/language/ko/korean-transforms.js diff --git a/dev/lib/hangul-js.js b/dev/lib/hangul-js.js new file mode 100644 index 0000000000..ed8df1dc3e --- /dev/null +++ 
b/dev/lib/hangul-js.js @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2023-2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +export * as Hangul from 'hangul-js'; diff --git a/ext/js/language/ko/korean-text-processors.js b/ext/js/language/ko/korean-text-processors.js index b0df4b413e..81aff5ded5 100644 --- a/ext/js/language/ko/korean-text-processors.js +++ b/ext/js/language/ko/korean-text-processors.js @@ -15,7 +15,9 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. 
*/ -import {assemble, disassemble} from './korean-hangul.js'; +// import {assemble, disassemble} from './korean-hangul.js'; + +import Hangul from '../../lib/hangul-js.js'; /** @type {import('language').TextProcessor} */ export const disassembleHangul = { @@ -23,7 +25,7 @@ export const disassembleHangul = { description: 'Disassemble Hangul characters into jamo.', options: [true], // Could probably also be set to [false, true], but this way it is always on process: (str) => { - return disassemble(str); + return Hangul.disassemble(str); } }; @@ -33,6 +35,6 @@ export const reassembleHangul = { description: 'Reassemble Hangul characters from jamo.', options: [true], // Could probably also be set to [false, true], but this way it is always on process: (str) => { - return assemble(str); + return Hangul.assemble(str); } }; diff --git a/ext/js/language/ko/korean-transforms.js b/ext/js/language/ko/korean-transforms.js new file mode 100644 index 0000000000..e48e06dfda --- /dev/null +++ b/ext/js/language/ko/korean-transforms.js @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ */ + +import {suffixInflection} from '../language-transforms.js'; + +/** @type {import('language-transformer').LanguageTransformDescriptor} */ +export const koreanTransforms = { + language: 'ko', + conditions: { + v: { + name: 'Verb or Auxiliary Verb', + partsOfSpeech: ['v'], + i18n: [ + { + language: 'ko', + name: '동사 / 보조 동사' + } + ] + }, + adj: { + name: 'Adjective or Auxiliary Adjective', + partsOfSpeech: ['adj'], + i18n: [ + { + language: 'ko', + name: '형용사 / 보조 형용사' + } + ] + }, + p: { + name: 'Intermediate past tense ending', + partsOfSpeech: [] + }, + f: { + name: 'Intermediate future tense ending', + partsOfSpeech: [] + }, + eusi: { + name: 'Intermediate formal ending', + partsOfSpeech: [] + }, + euob: { + name: 'Intermediate formal ending', + partsOfSpeech: [] + }, + sao: { + name: 'Intermediate formal ending', + partsOfSpeech: [] + }, + saob: { + name: 'Intermediate formal ending', + partsOfSpeech: [] + }, + sab: { + name: 'Intermediate formal ending', + partsOfSpeech: [] + } + }, + transforms: [ + { + name: '거나', + description: 'Conditional', + rules: [ + suffixInflection('ㄱㅓㄴㅏ', 'ㄷㅏ', [], ['v', 'adj']), + suffixInflection('ㄱㅓㄴㅏ', '', [], ['p', 'f', 'euob', 'eusi']), + suffixInflection('ㅇㅣㄱㅓㄴㅏ', 'ㅇㅣㄷㅏ', [], []) + ] + } + ] +}; diff --git a/package.json b/package.json index 1ef6cc496c..ec99f83605 100644 --- a/package.json +++ b/package.json @@ -107,6 +107,7 @@ "@zip.js/zip.js": "^2.7.36", "dexie": "^3.2.5", "dexie-export-import": "^4.1.1", + "hangul-js": "^0.2.6", "parse5": "^7.1.2", "wanakana": "^5.3.1", "yomitan-handlebars": "git+https://github.com/themoeway/yomitan-handlebars.git#12aff5e3550954d7d3a98a5917ff7d579f3cce25" From 475b14019e395cad5a8d8b140dcf60f4cfc08802 Mon Sep 17 00:00:00 2001 From: Stefan Vukovic Date: Mon, 25 Mar 2024 13:13:10 +0100 Subject: [PATCH 3/9] fix tests --- .eslintrc.json | 1 - ext/js/language/ko/korean-hangul.js | 586 ------------------- ext/js/language/ko/korean-text-processors.js | 4 +- 
ext/js/language/ko/korean-transforms.js | 18 +- package-lock.json | 11 + 5 files changed, 21 insertions(+), 599 deletions(-) delete mode 100644 ext/js/language/ko/korean-hangul.js diff --git a/.eslintrc.json b/.eslintrc.json index 6fdb8a14d6..4e7503ad71 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -647,7 +647,6 @@ "ext/js/language/ja/japanese-transforms.js", "ext/js/language/ja/japanese-wanakana.js", "ext/js/language/ja/japanese.js", - "ext/js/language/ko/korean-hangul.js", "ext/js/language/ko/korean-text-processors.js", "ext/js/language/la/latin-text-preprocessors.js", "ext/js/language/language-descriptors.js", diff --git a/ext/js/language/ko/korean-hangul.js b/ext/js/language/ko/korean-hangul.js deleted file mode 100644 index d9863af79a..0000000000 --- a/ext/js/language/ko/korean-hangul.js +++ /dev/null @@ -1,586 +0,0 @@ -/* - * Copyright (C) 2024 Yomitan Authors - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ - -const HANGUL_OFFSET = 0xAC00; - -const CHO = [ - 'ㄱ', - 'ㄲ', - 'ㄴ', - 'ㄷ', - 'ㄸ', - 'ㄹ', - 'ㅁ', - 'ㅂ', - 'ㅃ', - 'ㅅ', - 'ㅆ', - 'ㅇ', - 'ㅈ', - 'ㅉ', - 'ㅊ', - 'ㅋ', - 'ㅌ', - 'ㅍ', - 'ㅎ' -]; - -const JUNG = [ - 'ㅏ', - 'ㅐ', - 'ㅑ', - 'ㅒ', - 'ㅓ', - 'ㅔ', - 'ㅕ', - 'ㅖ', - 'ㅗ', - ['ㅗ', 'ㅏ'], - ['ㅗ', 'ㅐ'], - ['ㅗ', 'ㅣ'], - 'ㅛ', - 'ㅜ', - ['ㅜ', 'ㅓ'], - ['ㅜ', 'ㅔ'], - ['ㅜ', 'ㅣ'], - 'ㅠ', - 'ㅡ', - ['ㅡ', 'ㅣ'], - 'ㅣ' -]; - -const JONG = [ - '', - 'ㄱ', - 'ㄲ', - ['ㄱ', 'ㅅ'], - 'ㄴ', - ['ㄴ', 'ㅈ'], - ['ㄴ', 'ㅎ'], - 'ㄷ', - 'ㄹ', - ['ㄹ', 'ㄱ'], - ['ㄹ', 'ㅁ'], - ['ㄹ', 'ㅂ'], - ['ㄹ', 'ㅅ'], - ['ㄹ', 'ㅌ'], - ['ㄹ', 'ㅍ'], - ['ㄹ', 'ㅎ'], - 'ㅁ', - 'ㅂ', - ['ㅂ', 'ㅅ'], - 'ㅅ', - 'ㅆ', - 'ㅇ', - 'ㅈ', - 'ㅊ', - 'ㅋ', - 'ㅌ', - 'ㅍ', - 'ㅎ' -]; - -const CONSONANTS = [ - 'ㄱ', - 'ㄲ', - 'ㄳ', - 'ㄴ', - 'ㄵ', - 'ㄶ', - 'ㄷ', - 'ㄸ', - 'ㄹ', - 'ㄺ', - 'ㄻ', - 'ㄼ', - 'ㄽ', - 'ㄾ', - 'ㄿ', - 'ㅀ', - 'ㅁ', - 'ㅂ', - 'ㅃ', - 'ㅄ', - 'ㅅ', - 'ㅆ', - 'ㅇ', - 'ㅈ', - 'ㅉ', - 'ㅊ', - 'ㅋ', - 'ㅌ', - 'ㅍ', - 'ㅎ' -]; - -const COMPLETE_CHO = [ - 'ㄱ', - 'ㄲ', - 'ㄴ', - 'ㄷ', - 'ㄸ', - 'ㄹ', - 'ㅁ', - 'ㅂ', - 'ㅃ', - 'ㅅ', - 'ㅆ', - 'ㅇ', - 'ㅈ', - 'ㅉ', - 'ㅊ', - 'ㅋ', - 'ㅌ', - 'ㅍ', - 'ㅎ' -]; - -const COMPLETE_JUNG = [ - 'ㅏ', - 'ㅐ', - 'ㅑ', - 'ㅒ', - 'ㅓ', - 'ㅔ', - 'ㅕ', - 'ㅖ', - 'ㅗ', - 'ㅘ', - 'ㅙ', - 'ㅚ', - 'ㅛ', - 'ㅜ', - 'ㅝ', - 'ㅞ', - 'ㅟ', - 'ㅠ', - 'ㅡ', - 'ㅢ', - 'ㅣ' -]; - -const COMPLETE_JONG = [ - '', - 'ㄱ', - 'ㄲ', - 'ㄳ', - 'ㄴ', - 'ㄵ', - 'ㄶ', - 'ㄷ', - 'ㄹ', - 'ㄺ', - 'ㄻ', - 'ㄼ', - 'ㄽ', - 'ㄾ', - 'ㄿ', - 'ㅀ', - 'ㅁ', - 'ㅂ', - 'ㅄ', - 'ㅅ', - 'ㅆ', - 'ㅇ', - 'ㅈ', - 'ㅊ', - 'ㅋ', - 'ㅌ', - 'ㅍ', - 'ㅎ' -]; - -const COMPLEX_CONSONANTS = [ - ['ㄱ', 'ㅅ', 'ㄳ'], - ['ㄴ', 'ㅈ', 'ㄵ'], - ['ㄴ', 'ㅎ', 'ㄶ'], - ['ㄹ', 'ㄱ', 'ㄺ'], - ['ㄹ', 'ㅁ', 'ㄻ'], - ['ㄹ', 'ㅂ', 'ㄼ'], - ['ㄹ', 'ㅅ', 'ㄽ'], - ['ㄹ', 'ㅌ', 'ㄾ'], - ['ㄹ', 'ㅍ', 'ㄿ'], - ['ㄹ', 'ㅎ', 'ㅀ'], - ['ㅂ', 'ㅅ', 'ㅄ'] -]; - -const COMPLEX_VOWELS = [ - ['ㅗ', 'ㅏ', 'ㅘ'], - ['ㅗ', 'ㅐ', 'ㅙ'], - ['ㅗ', 'ㅣ', 'ㅚ'], - ['ㅜ', 'ㅓ', 'ㅝ'], - ['ㅜ', 'ㅔ', 'ㅞ'], - ['ㅜ', 'ㅣ', 'ㅟ'], - ['ㅡ', 'ㅣ', 'ㅢ'] -]; - -const makeHash = (array) => { - const hash = {0: 0}; - for (let i = 0; i < 
array.length; i++) { - if (array[i]) { - hash[array[i].charCodeAt(0)] = i; - } - } - return hash; -}; - -const CONSONANTS_HASH = makeHash(CONSONANTS); - -const CHO_HASH = makeHash(COMPLETE_CHO); - -const JUNG_HASH = makeHash(COMPLETE_JUNG); - -const JONG_HASH = makeHash(COMPLETE_JONG); - -const makeComplexHash = (array) => { - const hash = {}; - let code1, code2; - for (let i = 0; i < array.length; i++) { - code1 = array[i][0].charCodeAt(0); - code2 = array[i][1].charCodeAt(0); - if (typeof hash[code1] === 'undefined') { - hash[code1] = {}; - } - hash[code1][code2] = array[i][2].charCodeAt(0); - } - return hash; -}; - -const COMPLEX_CONSONANTS_HASH = makeComplexHash(COMPLEX_CONSONANTS); - -const COMPLEX_VOWELS_HASH = makeComplexHash(COMPLEX_VOWELS); - -/** - * Checks if the given character is a Korean consonant. - * @param {number} c The character to check. - * @returns {boolean} True if the character is a Korean consonant, false otherwise. - */ -function isConsonant(c) { - return typeof CONSONANTS_HASH[c] !== 'undefined'; -} - -/** - * Checks if the given character is a Korean initial consonant (cho). - * @param {number} c The character to check. - * @returns {boolean} True if the character is a Korean initial consonant, false otherwise. - */ -function isCho(c) { - return typeof CHO_HASH[c] !== 'undefined'; -} - -/** - * Checks if the given character is a Korean vowel (jung). - * @param {number} c The character to check. - * @returns {boolean} True if the character is a Korean vowel, false otherwise. - */ -function isJung(c) { - return typeof JUNG_HASH[c] !== 'undefined'; -} - -/** - * Checks if the given character is a Korean final consonant (jong). - * @param {number} c The character to check. - * @returns {boolean} True if the character is a Korean final consonant, false otherwise. - */ -function isJong(c) { - return typeof JONG_HASH[c] !== 'undefined'; -} - -/** - * Checks if the given character code represents a Hangul character. 
- * @param {number} charCode The character code to check. - * @returns {boolean} True if the character code represents a Hangul character, false otherwise. - */ -function isHangul(charCode) { - return HANGUL_OFFSET <= charCode && charCode <= 0xd7a3; -} - -/** - * Retrieves the indices of the initial consonant (cho), vowel (jung), and final consonant (jong) - * that make up the given Hangul character code. - * @param {number} charCode The character code of the Hangul character. - * @returns {object} An object containing the indices of cho, jung, and jong. - */ -function getHangulIndices(charCode) { - const baseCode = charCode - HANGUL_OFFSET; - return { - cho: Math.floor(baseCode / 588), - jung: Math.floor((baseCode % 588) / 28), - jong: baseCode % 28 - }; -} - -/** - * Checks if the given characters 'a' and 'b' can be combined to form a complex vowel. - * @param {number} a The character code of the first vowel. - * @param {number} b The character code of the second vowel. - * @returns {(number|boolean)} The character code of the combined complex vowel, or false if they cannot be combined. - */ -function isJungJoinable(a, b) { - return (COMPLEX_VOWELS_HASH[a] && COMPLEX_VOWELS_HASH[a][b]) ? COMPLEX_VOWELS_HASH[a][b] : false; -} - -/** - * Checks if the given characters 'a' and 'b' can be combined to form a complex final consonant. - * @param {number} a The character code of the first final consonant. - * @param {number} b The character code of the second final consonant. - * @returns {(number|boolean)} The character code of the combined complex final consonant, or false if they cannot be combined. - */ -function isJongJoinable(a, b) { - return COMPLEX_CONSONANTS_HASH[a] && COMPLEX_CONSONANTS_HASH[a][b] ? COMPLEX_CONSONANTS_HASH[a][b] : false; -} - -/** - * Disassembles a given string into an array of individual Hangul characters or character components. - * @param {string} string The string to be disassembled. 
- * @param {boolean} [grouped=false] Whether to group the components of each Hangul character. - * @returns {(string|string[])} An array of individual Hangul characters or character components, or a single string if `grouped` is false. - * @throws {Error} If the input string is null. - */ -export const disassemble = (string, grouped = false) => { - if (string === null) { - throw new Error('Arguments cannot be null'); - } - - string = Array.isArray(string) ? string.join('') : string; - - const result = []; - - for (const character of string) { - const charCode = character.charCodeAt(0); - - if (isHangul(charCode)) { - const {cho, jung, jong} = getHangulIndices(charCode); - - const disassembled = [CHO[cho]]; - - if (Array.isArray(JUNG[jung])) { - disassembled.push(JUNG[jung].join('')); // eslint-disable-line @typescript-eslint/no-unsafe-argument - } else { - disassembled.push(JUNG[jung]); - } - - if (JONG[jong]) { - if (Array.isArray(JONG[jong])) { - disassembled.push(JONG[jong].join('')); // eslint-disable-line @typescript-eslint/no-unsafe-argument - } else { - disassembled.push(JONG[jong]); - } - } - - if (grouped) { - result.push(disassembled); - } else { - result.push(...disassembled); - } - } else if (isConsonant(charCode)) { - if (isCho(charCode)) { - result.push(CHO[CHO_HASH[charCode]]); - } else { - if (Array.isArray(JONG[JONG_HASH[charCode]])) { - result.push(JONG[JONG_HASH[charCode]].join('')); - } else { - result.push(JONG[JONG_HASH[charCode]]); - } - } - } else if (isJung(charCode)) { - if (Array.isArray(JUNG[JUNG_HASH[charCode]])) { - result.push(JUNG[JUNG_HASH[charCode]].join('')); - } else { - result.push(JUNG[JUNG_HASH[charCode]]); - } - } else { - result.push(character); - } - } - return grouped ? result : result.join(''); -}; - -/** - * Assembles an array of individual Hangul characters or character components into a single string. - * @param {string} string The string containing individual Hangul characters or character components to be assembled. 
- * @returns {string} The assembled string. - * @throws {Error} If the input string is null. - */ -export const assemble = (string) => { - if (string === null) { - throw new Error('Arguments cannot be null'); - } - - const array = [...disassemble(string)]; - - const result = []; - - let complete_index = -1, - jong_joined = false; - - /** - * Helper function to combine jamo into hangul - * @param {number} index Index of a hangul - */ - function makeHangul(index) { - let cho, - jung1, - jung2, - jong1 = 0, - jong2, - hangul = ''; - - jong_joined = false; - - if (complete_index + 1 > index) { - return; - } - - for (let step = 1; ; step++) { - // eslint-disable-next-line unicorn/prefer-switch - if (step === 1) { - cho = array[complete_index + step].charCodeAt(0); - if (isJung(cho)) { - if (complete_index + step + 1 <= index && isJung(jung1 = array[complete_index + step + 1].charCodeAt(0))) { - result.push(String.fromCharCode(isJungJoinable(cho, jung1))); - complete_index = index; - return; - } else { - result.push(array[complete_index + step]); - complete_index = index; - return; - } - } else if (!isCho(cho)) { - result.push(array[complete_index + step]); - complete_index = index; - return; - } - hangul = array[complete_index + step]; - } else if (step === 2) { - jung1 = array[complete_index + step].charCodeAt(0); - if (isCho(jung1)) { - result.push(String.fromCharCode(isJongJoinable(cho, jung1))); - complete_index = index; - return; - } else { - hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + HANGUL_OFFSET); - } - } else if (step === 3) { - jung2 = array[complete_index + step].charCodeAt(0); - if (isJungJoinable(jung1, jung2)) { - jung1 = isJungJoinable(jung1, jung2); - } else { - jong1 = jung2; - } - hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + JONG_HASH[jong1] + HANGUL_OFFSET); // eslint-disable-line @typescript-eslint/no-unsafe-argument - } else if (step === 4) { - jong2 = array[complete_index + 
step].charCodeAt(0); - jong1 = isJongJoinable(jong1, jong2) ?? jong2; - hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + JONG_HASH[jong1] + HANGUL_OFFSET); // eslint-disable-line @typescript-eslint/no-unsafe-argument - } else if (step === 5) { - jong2 = array[complete_index + step].charCodeAt(0); - jong1 = isJongJoinable(jong1, jong2); - hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + JONG_HASH[jong1] + HANGUL_OFFSET); // eslint-disable-line @typescript-eslint/no-unsafe-argument - } - if (complete_index + step >= index) { - result.push(hangul); - complete_index = index; - return; - } - } - } - - let index, - stage = 0, - previousCharCode; - - for (index = 0; index < array.length; index++) { - const charCode = array[index].charCodeAt(0); - - if (!isCho(charCode) && !isJung(charCode) && !isJong(charCode)) { - makeHangul(index - 1); - makeHangul(index); - stage = 0; - continue; - } - - // eslint-disable-next-line unicorn/prefer-switch - if (stage === 0) { - if (isCho(charCode)) { - stage = 1; - } else if (isJung(charCode)) { - stage = 4; - } - } else if (stage === 1) { - if (isJung(charCode)) { - stage = 2; - } else { - if (isJongJoinable(previousCharCode, charCode)) { - stage = 5; - } else { - makeHangul(index - 1); - } - } - } else if (stage === 2) { - if (isJong(charCode)) { - stage = 3; - } else if (isJung(charCode)) { - if (!isJungJoinable(previousCharCode, charCode)) { - makeHangul(index - 1); - stage = 4; - } - } else { - makeHangul(index - 1); - stage = 1; - } - } else if (stage === 3) { - if (isJong(charCode)) { - if (!jong_joined && isJongJoinable(previousCharCode, charCode)) { - jong_joined = true; - } else { - makeHangul(index - 1); - stage = 1; - } - } else if (isCho(charCode)) { - makeHangul(index - 1); - stage = 1; - } else if (isJung(charCode)) { - makeHangul(index - 2); - stage = 2; - } - } else if (stage === 4) { - if (isJung(charCode)) { - if (isJungJoinable(previousCharCode, charCode)) { - 
makeHangul(index); - stage = 0; - } else { - makeHangul(index - 1); - } - } else { - makeHangul(index - 1); - stage = 1; - } - } else if (stage === 5) { - if (isJung(charCode)) { - makeHangul(index - 2); - stage = 2; - } else { - makeHangul(index - 1); - stage = 1; - } - } - previousCharCode = charCode; - } - makeHangul(index - 1); - return result.join(''); -}; diff --git a/ext/js/language/ko/korean-text-processors.js b/ext/js/language/ko/korean-text-processors.js index 81aff5ded5..3ef0cb0442 100644 --- a/ext/js/language/ko/korean-text-processors.js +++ b/ext/js/language/ko/korean-text-processors.js @@ -15,9 +15,7 @@ * along with this program. If not, see . */ -// import {assemble, disassemble} from './korean-hangul.js'; - -import Hangul from '../../lib/hangul-js.js'; +import {Hangul} from '../../../lib/hangul-js.js'; /** @type {import('language').TextProcessor} */ export const disassembleHangul = { diff --git a/ext/js/language/ko/korean-transforms.js b/ext/js/language/ko/korean-transforms.js index e48e06dfda..0596ec9dc1 100644 --- a/ext/js/language/ko/korean-transforms.js +++ b/ext/js/language/ko/korean-transforms.js @@ -23,7 +23,7 @@ export const koreanTransforms = { conditions: { v: { name: 'Verb or Auxiliary Verb', - partsOfSpeech: ['v'], + isDictionaryForm: true, i18n: [ { language: 'ko', @@ -33,7 +33,7 @@ export const koreanTransforms = { }, adj: { name: 'Adjective or Auxiliary Adjective', - partsOfSpeech: ['adj'], + isDictionaryForm: true, i18n: [ { language: 'ko', @@ -43,31 +43,31 @@ export const koreanTransforms = { }, p: { name: 'Intermediate past tense ending', - partsOfSpeech: [] + isDictionaryForm: false }, f: { name: 'Intermediate future tense ending', - partsOfSpeech: [] + isDictionaryForm: false }, eusi: { name: 'Intermediate formal ending', - partsOfSpeech: [] + isDictionaryForm: false }, euob: { name: 'Intermediate formal ending', - partsOfSpeech: [] + isDictionaryForm: false }, sao: { name: 'Intermediate formal ending', - partsOfSpeech: [] + 
isDictionaryForm: false }, saob: { name: 'Intermediate formal ending', - partsOfSpeech: [] + isDictionaryForm: false }, sab: { name: 'Intermediate formal ending', - partsOfSpeech: [] + isDictionaryForm: false } }, transforms: [ diff --git a/package-lock.json b/package-lock.json index 5016c50d0e..e0ac7bf4b3 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12,6 +12,7 @@ "@zip.js/zip.js": "^2.7.36", "dexie": "^3.2.5", "dexie-export-import": "^4.1.1", + "hangul-js": "^0.2.6", "parse5": "^7.1.2", "wanakana": "^5.3.1", "yomitan-handlebars": "git+https://github.com/themoeway/yomitan-handlebars.git#12aff5e3550954d7d3a98a5917ff7d579f3cce25" @@ -5156,6 +5157,11 @@ "uglify-js": "^3.1.4" } }, + "node_modules/hangul-js": { + "version": "0.2.6", + "resolved": "https://registry.npmjs.org/hangul-js/-/hangul-js-0.2.6.tgz", + "integrity": "sha512-48axU8LgjCD30FEs66Xc04/8knxMwCMQw0f67l67rlttW7VXT3qRJgQeHmhiuGwWXGvSbk6YM0fhQlcjE1JFQA==" + }, "node_modules/has-bigints": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/has-bigints/-/has-bigints-1.0.2.tgz", @@ -14019,6 +14025,11 @@ "wordwrap": "^1.0.0" } }, + "hangul-js": { + "version": "0.2.6", + "resolved": "https://registry.npmjs.org/hangul-js/-/hangul-js-0.2.6.tgz", + "integrity": "sha512-48axU8LgjCD30FEs66Xc04/8knxMwCMQw0f67l67rlttW7VXT3qRJgQeHmhiuGwWXGvSbk6YM0fhQlcjE1JFQA==" + }, "has-bigints": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/has-bigints/-/has-bigints-1.0.2.tgz", From f9be9c78041f43f1bcf47e308103d57ab4267138 Mon Sep 17 00:00:00 2001 From: Lyroxide Date: Mon, 25 Mar 2024 20:55:58 +0800 Subject: [PATCH 4/9] test inputs --- .eslintrc.json | 1 + ext/js/language/ko/korean-transforms.js | 1 - ext/js/language/language-descriptors.js | 5 +++-- .../valid-dictionary1/term_bank_1.json | 3 ++- test/data/translator-test-inputs.json | 14 ++++++++++++++ 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/.eslintrc.json b/.eslintrc.json index 4e7503ad71..0d60e4d709 100644 --- 
a/.eslintrc.json +++ b/.eslintrc.json @@ -648,6 +648,7 @@ "ext/js/language/ja/japanese-wanakana.js", "ext/js/language/ja/japanese.js", "ext/js/language/ko/korean-text-processors.js", + "ext/js/language/ko/korean-transforms.js", "ext/js/language/la/latin-text-preprocessors.js", "ext/js/language/language-descriptors.js", "ext/js/language/language-transformer.js", diff --git a/ext/js/language/ko/korean-transforms.js b/ext/js/language/ko/korean-transforms.js index 0596ec9dc1..5ec619125f 100644 --- a/ext/js/language/ko/korean-transforms.js +++ b/ext/js/language/ko/korean-transforms.js @@ -73,7 +73,6 @@ export const koreanTransforms = { transforms: [ { name: '거나', - description: 'Conditional', rules: [ suffixInflection('ㄱㅓㄴㅏ', 'ㄷㅏ', [], ['v', 'adj']), suffixInflection('ㄱㅓㄴㅏ', '', [], ['p', 'f', 'euob', 'eusi']), diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index 6782cd8088..09fde3029e 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -22,6 +22,7 @@ import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidth import {japaneseTransforms} from './ja/japanese-transforms.js'; import {isStringPartiallyJapanese} from './ja/japanese.js'; import {disassembleHangul, reassembleHangul} from './ko/korean-text-processors.js'; +import {koreanTransforms} from './ko/korean-transforms.js'; import {removeLatinDiacritics} from './la/latin-text-preprocessors.js'; import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js'; import {albanianTransforms} from './sq/albanian-transforms.js'; @@ -145,8 +146,8 @@ const languageDescriptors = [ }, textPostprocessors: { reassembleHangul - } - // languageTransforms: koreanTransforms + }, + languageTransforms: koreanTransforms }, { iso: 'pl', diff --git a/test/data/dictionaries/valid-dictionary1/term_bank_1.json b/test/data/dictionaries/valid-dictionary1/term_bank_1.json index e7fb015cad..a362de2339 100644 --- 
a/test/data/dictionaries/valid-dictionary1/term_bank_1.json +++ b/test/data/dictionaries/valid-dictionary1/term_bank_1.json @@ -343,5 +343,6 @@ ["39", "さんきゅう", "", "", 1, ["sankyuu definition"], 17, ""], ["凄い", "すごい", "adj-i", "adj-i", 1, ["sugoi definition"], 18, ""], ["English", "", "n", "n", 1, ["English definition"], 19, ""], - ["language", "", "n", "n", 1, ["language definition"], 20, ""] + ["language", "", "n", "n", 1, ["language definition"], 20, ""], + ["마시다", "", "v", "v", 1, ["masida definition"], 21, ""] ] diff --git a/test/data/translator-test-inputs.json b/test/data/translator-test-inputs.json index c904771684..7ad69d2fcd 100644 --- a/test/data/translator-test-inputs.json +++ b/test/data/translator-test-inputs.json @@ -456,6 +456,20 @@ "removeNonJapaneseCharacters": false } ] + }, + { + "name": "Test korean deinflection", + "func": "findTerms", + "mode": "split", + "text": "마시거나", + "options": [ + "default", + { + "type": "terms", + "language": "ko", + "removeNonJapaneseCharacters": false + } + ] } ] } From 202896ae80e9745a65ceac2e874e19193d888b4b Mon Sep 17 00:00:00 2001 From: Stefan Vukovic Date: Mon, 25 Mar 2024 14:29:40 +0100 Subject: [PATCH 5/9] fix hangul disassemble, fix postprocessing --- ext/js/language/ko/korean-text-processors.js | 2 +- ext/js/language/translator.js | 6 +- test/data/anki-note-builder-test-results.json | 42 +++++++ test/data/database-test-cases.json | 6 +- .../translator-test-results-note-data1.json | 116 ++++++++++++++++++ test/data/translator-test-results.json | 81 ++++++++++++ 6 files changed, 247 insertions(+), 6 deletions(-) diff --git a/ext/js/language/ko/korean-text-processors.js b/ext/js/language/ko/korean-text-processors.js index 3ef0cb0442..c77510c255 100644 --- a/ext/js/language/ko/korean-text-processors.js +++ b/ext/js/language/ko/korean-text-processors.js @@ -23,7 +23,7 @@ export const disassembleHangul = { description: 'Disassemble Hangul characters into jamo.', options: [true], // Could probably also be set to 
[false, true], but this way it is always on process: (str) => { - return Hangul.disassemble(str); + return Hangul.disassemble(str, false).join(''); } }; diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index bc53e2a625..549d8cfafc 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -480,12 +480,14 @@ export class Translator { if (used.has(source)) { break; } used.add(source); const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i)); - for (const {text: transformedText, conditions, trace} of this._multiLanguageTransformer.transform(language, source)) { + for (const deinflection of this._multiLanguageTransformer.transform(language, source)) { + const {trace, conditions} = deinflection; + let {text: transformedText} = deinflection; for (const postprocessorVariant of this._generateArrayVariants(postprocessorOptionsSpace)) { for (const postprocessor of textPostprocessors.values()) { const {id, textProcessor} = postprocessor; const setting = postprocessorVariant.get(id); - text2 = textProcessor.process(text2, setting, sourceMap); + transformedText = textProcessor.process(transformedText, setting, sourceMap); } /** @type {import('dictionary').InflectionRuleChainCandidate} */ diff --git a/test/data/anki-note-builder-test-results.json b/test/data/anki-note-builder-test-results.json index 1d84712dea..9733695f9b 100644 --- a/test/data/anki-note-builder-test-results.json +++ b/test/data/anki-note-builder-test-results.json @@ -3919,5 +3919,47 @@ "url": "url:" } ] + }, + { + "name": "Test korean deinflection", + "results": [ + { + "audio": "", + "clipboard-image": "", + "clipboard-text": "", + "cloze-body": "마시거나", + "cloze-body-kana": "마시거나", + "cloze-prefix": "cloze-prefix", + "cloze-suffix": "cloze-suffix", + "conjugation": "거나", + "dictionary": "Test Dictionary 2", + "document-title": "title", + "expression": "마시다", + "frequencies": "", + "frequency-harmonic-rank": "9999999", + 
"frequency-harmonic-occurrence": "0", + "frequency-average-rank": "9999999", + "frequency-average-occurrence": "0", + "furigana": "마시다", + "furigana-plain": "마시다", + "glossary": "
(v, Test Dictionary 2) masida definition
", + "glossary-brief": "
masida definition
", + "glossary-no-dictionary": "
(v) masida definition
", + "part-of-speech": "v", + "pitch-accents": "No pitch accent data", + "pitch-accent-graphs": "No pitch accent data", + "pitch-accent-positions": "No pitch accent data", + "pitch-accent-categories": "", + "phonetic-transcriptions": "", + "reading": "마시다", + "screenshot": "", + "search-query": "fullQuery", + "selection-text": "", + "sentence": "cloze-prefix마시거나cloze-suffix", + "sentence-furigana": "cloze-prefix마시거나cloze-suffix", + "tags": "v", + "url": "url:" + } + ] } ] diff --git a/test/data/database-test-cases.json b/test/data/database-test-cases.json index 5747f59ef0..054ede34bc 100644 --- a/test/data/database-test-cases.json +++ b/test/data/database-test-cases.json @@ -27,7 +27,7 @@ "ipa": 1 }, "terms": { - "total": 29 + "total": 30 } } }, @@ -36,7 +36,7 @@ { "kanji": 2, "kanjiMeta": 6, - "terms": 29, + "terms": 30, "termMeta": 39, "tagMeta": 15, "media": 6 @@ -45,7 +45,7 @@ "total": { "kanji": 2, "kanjiMeta": 6, - "terms": 29, + "terms": 30, "termMeta": 39, "tagMeta": 15, "media": 6 diff --git a/test/data/translator-test-results-note-data1.json b/test/data/translator-test-results-note-data1.json index f580ac53bc..0eb5acf6b7 100644 --- a/test/data/translator-test-results-note-data1.json +++ b/test/data/translator-test-results-note-data1.json @@ -32940,5 +32940,121 @@ "media": {} } ] + }, + { + "name": "Test korean deinflection", + "noteDataList": [ + { + "marker": "{marker}", + "definition": { + "type": "term", + "id": 29, + "source": "ㅁㅏㅅㅣㄱㅓㄴㅏ", + "rawSource": "마시거나", + "sourceTerm": "마시다", + "inflectionRuleChainCandidates": [ + { + "source": "algorithm", + "inflectionRules": [ + "거나" + ] + } + ], + "score": 1, + "isPrimary": true, + "sequence": 21, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "dictionaryNames": [ + "Test Dictionary 2" + ], + "expression": "마시다", + "reading": "마시다", + "expressions": [ + { + "sourceTerm": "마시다", + "expression": "마시다", + "reading": "마시다", + "termTags": [], + "frequencies": 
[], + "pitches": [], + "furiganaSegments": [ + { + "text": "마시다", + "furigana": "" + } + ], + "termFrequency": "normal", + "wordClasses": [ + "v" + ] + } + ], + "glossary": [ + "masida definition" + ], + "definitionTags": [ + { + "name": "v", + "category": "default", + "notes": "", + "order": 0, + "score": 0, + "dictionary": "Test Dictionary 2", + "redundant": false + } + ], + "termTags": [], + "frequencies": [], + "frequencyHarmonic": -1, + "frequencyAverage": -1, + "pitches": [], + "phoneticTranscriptions": [], + "sourceTermExactMatchCount": 1, + "url": "url:", + "cloze": { + "sentence": "", + "prefix": "", + "body": "", + "bodyKana": "", + "suffix": "" + }, + "furiganaSegments": [ + { + "text": "마시다", + "furigana": "" + } + ] + }, + "glossaryLayoutMode": "default", + "compactTags": false, + "group": false, + "merge": false, + "modeTermKanji": false, + "modeTermKana": false, + "modeKanji": false, + "compactGlossaries": false, + "uniqueExpressions": [ + "마시다" + ], + "uniqueReadings": [ + "마시다" + ], + "pitches": [], + "pitchCount": 0, + "phoneticTranscriptions": [], + "context": { + "query": "query", + "fullQuery": "fullQuery", + "document": { + "title": "title" + } + }, + "media": {} + } + ] } ] diff --git a/test/data/translator-test-results.json b/test/data/translator-test-results.json index cd3f7ab6c0..ac368bc289 100644 --- a/test/data/translator-test-results.json +++ b/test/data/translator-test-results.json @@ -18604,5 +18604,86 @@ "frequencies": [] } ] + }, + { + "name": "Test korean deinflection", + "originalTextLength": 4, + "dictionaryEntries": [ + { + "type": "term", + "isPrimary": true, + "inflectionRuleChainCandidates": [ + { + "source": "algorithm", + "inflectionRules": [ + "거나" + ] + } + ], + "score": 1, + "frequencyOrder": 0, + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "sourceTermExactMatchCount": 1, + "maxTransformedTextLength": 8, + "headwords": [ + { + "index": 0, + "term": "마시다", + "reading": "마시다", + "sources": [ + { + "originalText": 
"마시거나", + "transformedText": "ㅁㅏㅅㅣㄱㅓㄴㅏ", + "deinflectedText": "마시다", + "matchType": "exact", + "matchSource": "term", + "isPrimary": true + } + ], + "tags": [], + "wordClasses": [ + "v" + ] + } + ], + "definitions": [ + { + "index": 0, + "headwordIndices": [ + 0 + ], + "dictionary": "Test Dictionary 2", + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "id": 29, + "score": 1, + "frequencyOrder": 0, + "sequences": [ + 21 + ], + "isPrimary": true, + "tags": [ + { + "name": "v", + "category": "default", + "order": 0, + "score": 0, + "content": [], + "dictionaries": [ + "Test Dictionary 2" + ], + "redundant": false + } + ], + "entries": [ + "masida definition" + ] + } + ], + "pronunciations": [], + "frequencies": [] + } + ] } ] From f2ddc6d24963c3a8b66d57af2863ff2a5e78304d Mon Sep 17 00:00:00 2001 From: Stefan Vukovic Date: Mon, 25 Mar 2024 14:33:50 +0100 Subject: [PATCH 6/9] rename test --- test/data/anki-note-builder-test-results.json | 2 +- test/data/translator-test-inputs.json | 2 +- test/data/translator-test-results-note-data1.json | 2 +- test/data/translator-test-results.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/data/anki-note-builder-test-results.json b/test/data/anki-note-builder-test-results.json index 9733695f9b..4eeb9f8e46 100644 --- a/test/data/anki-note-builder-test-results.json +++ b/test/data/anki-note-builder-test-results.json @@ -3921,7 +3921,7 @@ ] }, { - "name": "Test korean deinflection", + "name": "Test text postprocessing", "results": [ { "audio": "", diff --git a/test/data/translator-test-inputs.json b/test/data/translator-test-inputs.json index 7ad69d2fcd..cad84f0ade 100644 --- a/test/data/translator-test-inputs.json +++ b/test/data/translator-test-inputs.json @@ -458,7 +458,7 @@ ] }, { - "name": "Test korean deinflection", + "name": "Test text postprocessing", "func": "findTerms", "mode": "split", "text": "마시거나", diff --git a/test/data/translator-test-results-note-data1.json 
b/test/data/translator-test-results-note-data1.json index 0eb5acf6b7..06613c95c6 100644 --- a/test/data/translator-test-results-note-data1.json +++ b/test/data/translator-test-results-note-data1.json @@ -32942,7 +32942,7 @@ ] }, { - "name": "Test korean deinflection", + "name": "Test text postprocessing", "noteDataList": [ { "marker": "{marker}", diff --git a/test/data/translator-test-results.json b/test/data/translator-test-results.json index ac368bc289..ba7b17d0f3 100644 --- a/test/data/translator-test-results.json +++ b/test/data/translator-test-results.json @@ -18606,7 +18606,7 @@ ] }, { - "name": "Test korean deinflection", + "name": "Test text postprocessing", "originalTextLength": 4, "dictionaryEntries": [ { From 8fe34359a6849404153d249b3b7789c570d74e59 Mon Sep 17 00:00:00 2001 From: Stefan Vukovic Date: Mon, 25 Mar 2024 14:39:52 +0100 Subject: [PATCH 7/9] reset transformedText for multiple postprocessors --- ext/js/language/translator.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index 549d8cfafc..b7960f8128 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -482,8 +482,8 @@ export class Translator { const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i)); for (const deinflection of this._multiLanguageTransformer.transform(language, source)) { const {trace, conditions} = deinflection; - let {text: transformedText} = deinflection; for (const postprocessorVariant of this._generateArrayVariants(postprocessorOptionsSpace)) { + let {text: transformedText} = deinflection; for (const postprocessor of textPostprocessors.values()) { const {id, textProcessor} = postprocessor; const setting = postprocessorVariant.get(id); From 12b57bc8af4e7e1d4a4b6f3b7cb8bce5f4e9e696 Mon Sep 17 00:00:00 2001 From: Lyroxide Date: Tue, 26 Mar 2024 00:16:00 +0800 Subject: [PATCH 8/9] add credits --- README.md | 1 + 1 file changed, 1 insertion(+) diff 
--git a/README.md b/README.md index c29c918bc2..0d3394cb8d 100644 --- a/README.md +++ b/README.md @@ -125,3 +125,4 @@ Yomitan uses several third-party libraries to function. | yomitan-handlebars | 1.0.0 | MIT | n/a | | parse5 | 7.1.2 | MIT | git://github.com/inikulin/parse5.git | | wanakana | 5.3.1 | MIT | git+ssh://git@github.com/WaniKani/WanaKana.git | +| hangul.js | 0.2.6 | MIT | git+https://github.com/e-/Hangul.js.git | From 7a1ac8b6718698c94dbd692fba92d0f871aed9bf Mon Sep 17 00:00:00 2001 From: Lyroxide Date: Tue, 26 Mar 2024 00:31:54 +0800 Subject: [PATCH 9/9] fix conflict --- .eslintrc.json | 1 - ext/js/language/ko/korean-hangul.js | 586 ---------------------------- 2 files changed, 587 deletions(-) delete mode 100644 ext/js/language/ko/korean-hangul.js diff --git a/.eslintrc.json b/.eslintrc.json index f91187c821..0d60e4d709 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -647,7 +647,6 @@ "ext/js/language/ja/japanese-transforms.js", "ext/js/language/ja/japanese-wanakana.js", "ext/js/language/ja/japanese.js", - "ext/js/language/ko/korean-hangul.js", "ext/js/language/ko/korean-text-processors.js", "ext/js/language/ko/korean-transforms.js", "ext/js/language/la/latin-text-preprocessors.js", diff --git a/ext/js/language/ko/korean-hangul.js b/ext/js/language/ko/korean-hangul.js deleted file mode 100644 index d9863af79a..0000000000 --- a/ext/js/language/ko/korean-hangul.js +++ /dev/null @@ -1,586 +0,0 @@ -/* - * Copyright (C) 2024 Yomitan Authors - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -const HANGUL_OFFSET = 0xAC00; - -const CHO = [ - 'ㄱ', - 'ㄲ', - 'ㄴ', - 'ㄷ', - 'ㄸ', - 'ㄹ', - 'ㅁ', - 'ㅂ', - 'ㅃ', - 'ㅅ', - 'ㅆ', - 'ㅇ', - 'ㅈ', - 'ㅉ', - 'ㅊ', - 'ㅋ', - 'ㅌ', - 'ㅍ', - 'ㅎ' -]; - -const JUNG = [ - 'ㅏ', - 'ㅐ', - 'ㅑ', - 'ㅒ', - 'ㅓ', - 'ㅔ', - 'ㅕ', - 'ㅖ', - 'ㅗ', - ['ㅗ', 'ㅏ'], - ['ㅗ', 'ㅐ'], - ['ㅗ', 'ㅣ'], - 'ㅛ', - 'ㅜ', - ['ㅜ', 'ㅓ'], - ['ㅜ', 'ㅔ'], - ['ㅜ', 'ㅣ'], - 'ㅠ', - 'ㅡ', - ['ㅡ', 'ㅣ'], - 'ㅣ' -]; - -const JONG = [ - '', - 'ㄱ', - 'ㄲ', - ['ㄱ', 'ㅅ'], - 'ㄴ', - ['ㄴ', 'ㅈ'], - ['ㄴ', 'ㅎ'], - 'ㄷ', - 'ㄹ', - ['ㄹ', 'ㄱ'], - ['ㄹ', 'ㅁ'], - ['ㄹ', 'ㅂ'], - ['ㄹ', 'ㅅ'], - ['ㄹ', 'ㅌ'], - ['ㄹ', 'ㅍ'], - ['ㄹ', 'ㅎ'], - 'ㅁ', - 'ㅂ', - ['ㅂ', 'ㅅ'], - 'ㅅ', - 'ㅆ', - 'ㅇ', - 'ㅈ', - 'ㅊ', - 'ㅋ', - 'ㅌ', - 'ㅍ', - 'ㅎ' -]; - -const CONSONANTS = [ - 'ㄱ', - 'ㄲ', - 'ㄳ', - 'ㄴ', - 'ㄵ', - 'ㄶ', - 'ㄷ', - 'ㄸ', - 'ㄹ', - 'ㄺ', - 'ㄻ', - 'ㄼ', - 'ㄽ', - 'ㄾ', - 'ㄿ', - 'ㅀ', - 'ㅁ', - 'ㅂ', - 'ㅃ', - 'ㅄ', - 'ㅅ', - 'ㅆ', - 'ㅇ', - 'ㅈ', - 'ㅉ', - 'ㅊ', - 'ㅋ', - 'ㅌ', - 'ㅍ', - 'ㅎ' -]; - -const COMPLETE_CHO = [ - 'ㄱ', - 'ㄲ', - 'ㄴ', - 'ㄷ', - 'ㄸ', - 'ㄹ', - 'ㅁ', - 'ㅂ', - 'ㅃ', - 'ㅅ', - 'ㅆ', - 'ㅇ', - 'ㅈ', - 'ㅉ', - 'ㅊ', - 'ㅋ', - 'ㅌ', - 'ㅍ', - 'ㅎ' -]; - -const COMPLETE_JUNG = [ - 'ㅏ', - 'ㅐ', - 'ㅑ', - 'ㅒ', - 'ㅓ', - 'ㅔ', - 'ㅕ', - 'ㅖ', - 'ㅗ', - 'ㅘ', - 'ㅙ', - 'ㅚ', - 'ㅛ', - 'ㅜ', - 'ㅝ', - 'ㅞ', - 'ㅟ', - 'ㅠ', - 'ㅡ', - 'ㅢ', - 'ㅣ' -]; - -const COMPLETE_JONG = [ - '', - 'ㄱ', - 'ㄲ', - 'ㄳ', - 'ㄴ', - 'ㄵ', - 'ㄶ', - 'ㄷ', - 'ㄹ', - 'ㄺ', - 'ㄻ', - 'ㄼ', - 'ㄽ', - 'ㄾ', - 'ㄿ', - 'ㅀ', - 'ㅁ', - 'ㅂ', - 'ㅄ', - 'ㅅ', - 'ㅆ', - 'ㅇ', - 'ㅈ', - 'ㅊ', - 'ㅋ', - 'ㅌ', - 'ㅍ', - 'ㅎ' -]; - -const COMPLEX_CONSONANTS = [ - ['ㄱ', 'ㅅ', 'ㄳ'], - ['ㄴ', 'ㅈ', 'ㄵ'], - ['ㄴ', 'ㅎ', 'ㄶ'], - ['ㄹ', 'ㄱ', 'ㄺ'], - ['ㄹ', 'ㅁ', 'ㄻ'], - ['ㄹ', 'ㅂ', 'ㄼ'], - ['ㄹ', 'ㅅ', 'ㄽ'], - ['ㄹ', 'ㅌ', 'ㄾ'], - ['ㄹ', 'ㅍ', 'ㄿ'], - ['ㄹ', 'ㅎ', 'ㅀ'], - ['ㅂ', 'ㅅ', 'ㅄ'] -]; - -const COMPLEX_VOWELS = [ - ['ㅗ', 'ㅏ', 'ㅘ'], - ['ㅗ', 'ㅐ', 'ㅙ'], - ['ㅗ', 'ㅣ', 'ㅚ'], - ['ㅜ', 'ㅓ', 'ㅝ'], - ['ㅜ', 'ㅔ', 'ㅞ'], - ['ㅜ', 'ㅣ', 
'ㅟ'], - ['ㅡ', 'ㅣ', 'ㅢ'] -]; - -const makeHash = (array) => { - const hash = {0: 0}; - for (let i = 0; i < array.length; i++) { - if (array[i]) { - hash[array[i].charCodeAt(0)] = i; - } - } - return hash; -}; - -const CONSONANTS_HASH = makeHash(CONSONANTS); - -const CHO_HASH = makeHash(COMPLETE_CHO); - -const JUNG_HASH = makeHash(COMPLETE_JUNG); - -const JONG_HASH = makeHash(COMPLETE_JONG); - -const makeComplexHash = (array) => { - const hash = {}; - let code1, code2; - for (let i = 0; i < array.length; i++) { - code1 = array[i][0].charCodeAt(0); - code2 = array[i][1].charCodeAt(0); - if (typeof hash[code1] === 'undefined') { - hash[code1] = {}; - } - hash[code1][code2] = array[i][2].charCodeAt(0); - } - return hash; -}; - -const COMPLEX_CONSONANTS_HASH = makeComplexHash(COMPLEX_CONSONANTS); - -const COMPLEX_VOWELS_HASH = makeComplexHash(COMPLEX_VOWELS); - -/** - * Checks if the given character is a Korean consonant. - * @param {number} c The character to check. - * @returns {boolean} True if the character is a Korean consonant, false otherwise. - */ -function isConsonant(c) { - return typeof CONSONANTS_HASH[c] !== 'undefined'; -} - -/** - * Checks if the given character is a Korean initial consonant (cho). - * @param {number} c The character to check. - * @returns {boolean} True if the character is a Korean initial consonant, false otherwise. - */ -function isCho(c) { - return typeof CHO_HASH[c] !== 'undefined'; -} - -/** - * Checks if the given character is a Korean vowel (jung). - * @param {number} c The character to check. - * @returns {boolean} True if the character is a Korean vowel, false otherwise. - */ -function isJung(c) { - return typeof JUNG_HASH[c] !== 'undefined'; -} - -/** - * Checks if the given character is a Korean final consonant (jong). - * @param {number} c The character to check. - * @returns {boolean} True if the character is a Korean final consonant, false otherwise. 
- */ -function isJong(c) { - return typeof JONG_HASH[c] !== 'undefined'; -} - -/** - * Checks if the given character code represents a Hangul character. - * @param {number} charCode The character code to check. - * @returns {boolean} True if the character code represents a Hangul character, false otherwise. - */ -function isHangul(charCode) { - return HANGUL_OFFSET <= charCode && charCode <= 0xd7a3; -} - -/** - * Retrieves the indices of the initial consonant (cho), vowel (jung), and final consonant (jong) - * that make up the given Hangul character code. - * @param {number} charCode The character code of the Hangul character. - * @returns {object} An object containing the indices of cho, jung, and jong. - */ -function getHangulIndices(charCode) { - const baseCode = charCode - HANGUL_OFFSET; - return { - cho: Math.floor(baseCode / 588), - jung: Math.floor((baseCode % 588) / 28), - jong: baseCode % 28 - }; -} - -/** - * Checks if the given characters 'a' and 'b' can be combined to form a complex vowel. - * @param {number} a The character code of the first vowel. - * @param {number} b The character code of the second vowel. - * @returns {(number|boolean)} The character code of the combined complex vowel, or false if they cannot be combined. - */ -function isJungJoinable(a, b) { - return (COMPLEX_VOWELS_HASH[a] && COMPLEX_VOWELS_HASH[a][b]) ? COMPLEX_VOWELS_HASH[a][b] : false; -} - -/** - * Checks if the given characters 'a' and 'b' can be combined to form a complex final consonant. - * @param {number} a The character code of the first final consonant. - * @param {number} b The character code of the second final consonant. - * @returns {(number|boolean)} The character code of the combined complex final consonant, or false if they cannot be combined. - */ -function isJongJoinable(a, b) { - return COMPLEX_CONSONANTS_HASH[a] && COMPLEX_CONSONANTS_HASH[a][b] ? 
COMPLEX_CONSONANTS_HASH[a][b] : false; -} - -/** - * Disassembles a given string into an array of individual Hangul characters or character components. - * @param {string} string The string to be disassembled. - * @param {boolean} [grouped=false] Whether to group the components of each Hangul character. - * @returns {(string|string[])} An array of individual Hangul characters or character components, or a single string if `grouped` is false. - * @throws {Error} If the input string is null. - */ -export const disassemble = (string, grouped = false) => { - if (string === null) { - throw new Error('Arguments cannot be null'); - } - - string = Array.isArray(string) ? string.join('') : string; - - const result = []; - - for (const character of string) { - const charCode = character.charCodeAt(0); - - if (isHangul(charCode)) { - const {cho, jung, jong} = getHangulIndices(charCode); - - const disassembled = [CHO[cho]]; - - if (Array.isArray(JUNG[jung])) { - disassembled.push(JUNG[jung].join('')); // eslint-disable-line @typescript-eslint/no-unsafe-argument - } else { - disassembled.push(JUNG[jung]); - } - - if (JONG[jong]) { - if (Array.isArray(JONG[jong])) { - disassembled.push(JONG[jong].join('')); // eslint-disable-line @typescript-eslint/no-unsafe-argument - } else { - disassembled.push(JONG[jong]); - } - } - - if (grouped) { - result.push(disassembled); - } else { - result.push(...disassembled); - } - } else if (isConsonant(charCode)) { - if (isCho(charCode)) { - result.push(CHO[CHO_HASH[charCode]]); - } else { - if (Array.isArray(JONG[JONG_HASH[charCode]])) { - result.push(JONG[JONG_HASH[charCode]].join('')); - } else { - result.push(JONG[JONG_HASH[charCode]]); - } - } - } else if (isJung(charCode)) { - if (Array.isArray(JUNG[JUNG_HASH[charCode]])) { - result.push(JUNG[JUNG_HASH[charCode]].join('')); - } else { - result.push(JUNG[JUNG_HASH[charCode]]); - } - } else { - result.push(character); - } - } - return grouped ? 
result : result.join(''); -}; - -/** - * Assembles an array of individual Hangul characters or character components into a single string. - * @param {string} string The string containing individual Hangul characters or character components to be assembled. - * @returns {string} The assembled string. - * @throws {Error} If the input string is null. - */ -export const assemble = (string) => { - if (string === null) { - throw new Error('Arguments cannot be null'); - } - - const array = [...disassemble(string)]; - - const result = []; - - let complete_index = -1, - jong_joined = false; - - /** - * Helper function to combine jamo into hangul - * @param {number} index Index of a hangul - */ - function makeHangul(index) { - let cho, - jung1, - jung2, - jong1 = 0, - jong2, - hangul = ''; - - jong_joined = false; - - if (complete_index + 1 > index) { - return; - } - - for (let step = 1; ; step++) { - // eslint-disable-next-line unicorn/prefer-switch - if (step === 1) { - cho = array[complete_index + step].charCodeAt(0); - if (isJung(cho)) { - if (complete_index + step + 1 <= index && isJung(jung1 = array[complete_index + step + 1].charCodeAt(0))) { - result.push(String.fromCharCode(isJungJoinable(cho, jung1))); - complete_index = index; - return; - } else { - result.push(array[complete_index + step]); - complete_index = index; - return; - } - } else if (!isCho(cho)) { - result.push(array[complete_index + step]); - complete_index = index; - return; - } - hangul = array[complete_index + step]; - } else if (step === 2) { - jung1 = array[complete_index + step].charCodeAt(0); - if (isCho(jung1)) { - result.push(String.fromCharCode(isJongJoinable(cho, jung1))); - complete_index = index; - return; - } else { - hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + HANGUL_OFFSET); - } - } else if (step === 3) { - jung2 = array[complete_index + step].charCodeAt(0); - if (isJungJoinable(jung1, jung2)) { - jung1 = isJungJoinable(jung1, jung2); - } else { - jong1 = 
jung2; - } - hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + JONG_HASH[jong1] + HANGUL_OFFSET); // eslint-disable-line @typescript-eslint/no-unsafe-argument - } else if (step === 4) { - jong2 = array[complete_index + step].charCodeAt(0); - jong1 = isJongJoinable(jong1, jong2) ?? jong2; - hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + JONG_HASH[jong1] + HANGUL_OFFSET); // eslint-disable-line @typescript-eslint/no-unsafe-argument - } else if (step === 5) { - jong2 = array[complete_index + step].charCodeAt(0); - jong1 = isJongJoinable(jong1, jong2); - hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + JONG_HASH[jong1] + HANGUL_OFFSET); // eslint-disable-line @typescript-eslint/no-unsafe-argument - } - if (complete_index + step >= index) { - result.push(hangul); - complete_index = index; - return; - } - } - } - - let index, - stage = 0, - previousCharCode; - - for (index = 0; index < array.length; index++) { - const charCode = array[index].charCodeAt(0); - - if (!isCho(charCode) && !isJung(charCode) && !isJong(charCode)) { - makeHangul(index - 1); - makeHangul(index); - stage = 0; - continue; - } - - // eslint-disable-next-line unicorn/prefer-switch - if (stage === 0) { - if (isCho(charCode)) { - stage = 1; - } else if (isJung(charCode)) { - stage = 4; - } - } else if (stage === 1) { - if (isJung(charCode)) { - stage = 2; - } else { - if (isJongJoinable(previousCharCode, charCode)) { - stage = 5; - } else { - makeHangul(index - 1); - } - } - } else if (stage === 2) { - if (isJong(charCode)) { - stage = 3; - } else if (isJung(charCode)) { - if (!isJungJoinable(previousCharCode, charCode)) { - makeHangul(index - 1); - stage = 4; - } - } else { - makeHangul(index - 1); - stage = 1; - } - } else if (stage === 3) { - if (isJong(charCode)) { - if (!jong_joined && isJongJoinable(previousCharCode, charCode)) { - jong_joined = true; - } else { - makeHangul(index - 1); - stage = 1; - } - } else if 
(isCho(charCode)) { - makeHangul(index - 1); - stage = 1; - } else if (isJung(charCode)) { - makeHangul(index - 2); - stage = 2; - } - } else if (stage === 4) { - if (isJung(charCode)) { - if (isJungJoinable(previousCharCode, charCode)) { - makeHangul(index); - stage = 0; - } else { - makeHangul(index - 1); - } - } else { - makeHangul(index - 1); - stage = 1; - } - } else if (stage === 5) { - if (isJung(charCode)) { - makeHangul(index - 2); - stage = 2; - } else { - makeHangul(index - 1); - stage = 1; - } - } - previousCharCode = charCode; - } - makeHangul(index - 1); - return result.join(''); -};