From a51f1ab2dc675a49bfeeb08cc24b97eb8d888e4a Mon Sep 17 00:00:00 2001 From: toasted-nutbread Date: Sun, 28 Jan 2024 07:21:12 -0500 Subject: [PATCH 1/2] Deinflection cycle testing (#477) * Add ReasonRaw * Set up cycle checks * Use Deinflector.rulesMatch * Remove checkRules * Use Deinflector.rulesMatch * Convert to test * Rename * Rename --- test/deinflection-cycles.test.js | 165 +++++++++++++++++++++++++++++++ test/deinflector.test.js | 2 +- types/ext/deinflector.d.ts | 14 +-- 3 files changed, 174 insertions(+), 7 deletions(-) create mode 100644 test/deinflection-cycles.test.js diff --git a/test/deinflection-cycles.test.js b/test/deinflection-cycles.test.js new file mode 100644 index 0000000000..a010d7a363 --- /dev/null +++ b/test/deinflection-cycles.test.js @@ -0,0 +1,165 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +import {readFileSync} from 'fs'; +import {join, dirname as pathDirname} from 'path'; +import {fileURLToPath} from 'url'; +import {describe, test} from 'vitest'; +import {parseJson} from '../dev/json.js'; +import {Deinflector} from '../ext/js/language/deinflector.js'; + +class DeinflectionNode { + /** + * @param {string} text + * @param {import('deinflector').ReasonTypeRaw[]} ruleNames + * @param {?RuleNode} ruleNode + * @param {?DeinflectionNode} previous + */ + constructor(text, ruleNames, ruleNode, previous) { + /** @type {string} */ + this.text = text; + /** @type {import('deinflector').ReasonTypeRaw[]} */ + this.ruleNames = ruleNames; + /** @type {?RuleNode} */ + this.ruleNode = ruleNode; + /** @type {?DeinflectionNode} */ + this.previous = previous; + } + + /** + * @param {DeinflectionNode} other + * @returns {boolean} + */ + historyIncludes(other) { + /** @type {?DeinflectionNode} */ + // eslint-disable-next-line @typescript-eslint/no-this-alias + let node = this; + for (; node !== null; node = node.previous) { + if ( + node.ruleNode === other.ruleNode && + node.text === other.text && + arraysAreEqual(node.ruleNames, other.ruleNames) + ) { + return true; + } + } + return false; + } + + /** + * @returns {DeinflectionNode[]} + */ + getHistory() { + /** @type {DeinflectionNode[]} */ + const results = []; + /** @type {?DeinflectionNode} */ + // eslint-disable-next-line @typescript-eslint/no-this-alias + let node = this; + for (; node !== null; node = node.previous) { + results.unshift(node); + } + return results; + } +} + +class RuleNode { + /** + * @param {string} groupName + * @param {import('deinflector').ReasonRaw} rule + */ + constructor(groupName, rule) { + /** @type {string} */ + this.groupName = groupName; + /** @type {import('deinflector').ReasonRaw} */ + this.rule = rule; + } +} + +/** + * @template [T=unknown] + * @param {T[]} rules1 + * @param {T[]} rules2 + * @returns {boolean} + */ +function arraysAreEqual(rules1, rules2) { + if 
(rules1.length !== rules2.length) { return false; } + for (const rule1 of rules1) { + if (!rules2.includes(rule1)) { return false; } + } + return true; +} + +describe('Deinflection data', () => { + test('Check for cycles', ({expect}) => { + const dirname = pathDirname(fileURLToPath(import.meta.url)); + + /** @type {import('deinflector').ReasonsRaw} */ + const deinflectionReasons = parseJson(readFileSync(join(dirname, '../ext/data/deinflect.json'), {encoding: 'utf8'})); + + /** @type {RuleNode[]} */ + const ruleNodes = []; + for (const [groupName, reasonInfo] of Object.entries(deinflectionReasons)) { + for (const rule of reasonInfo) { + ruleNodes.push(new RuleNode(groupName, rule)); + } + } + + /** @type {DeinflectionNode[]} */ + const deinflectionNodes = []; + for (const ruleNode of ruleNodes) { + deinflectionNodes.push(new DeinflectionNode(`?${ruleNode.rule.kanaIn}`, [], null, null)); + } + for (let i = 0; i < deinflectionNodes.length; ++i) { + const deinflectionNode = deinflectionNodes[i]; + const {text, ruleNames} = deinflectionNode; + for (const ruleNode of ruleNodes) { + const {kanaIn, kanaOut, rulesIn, rulesOut} = ruleNode.rule; + if ( + !Deinflector.rulesMatch(Deinflector.rulesToRuleFlags(ruleNames), Deinflector.rulesToRuleFlags(rulesIn)) || + !text.endsWith(kanaIn) || + (text.length - kanaIn.length + kanaOut.length) <= 0 + ) { + continue; + } + + const newDeinflectionNode = new DeinflectionNode( + text.substring(0, text.length - kanaIn.length) + kanaOut, + rulesOut, + ruleNode, + deinflectionNode + ); + + // Cycle check + if (deinflectionNode.historyIncludes(newDeinflectionNode)) { + const stack = []; + for (const item of newDeinflectionNode.getHistory()) { + stack.push( + item.ruleNode === null ? 
+ `${item.text} (start)` : + `${item.text} (${item.ruleNode.groupName}, ${item.ruleNode.rule.rulesIn.join(',')}=>${item.ruleNode.rule.rulesOut.join(',')}, ${item.ruleNode.rule.kanaIn}=>${item.ruleNode.rule.kanaOut})` + ); + } + const message = `Cycle detected:\n ${stack.join('\n ')}`; + expect.soft(true, message).toEqual(false); + continue; + } + + deinflectionNodes.push(newDeinflectionNode); + } + } + }); +}); \ No newline at end of file diff --git a/test/deinflector.test.js b/test/deinflector.test.js index 660b909a5c..69495b4c00 100644 --- a/test/deinflector.test.js +++ b/test/deinflector.test.js @@ -38,7 +38,7 @@ function hasTermReasons(deinflector, source, expectedTerm, expectedRule, expecte if (term !== expectedTerm) { continue; } if (typeof expectedRule !== 'undefined') { const expectedFlags = Deinflector.rulesToRuleFlags([expectedRule]); - if (rules !== 0 && (rules & expectedFlags) !== expectedFlags) { continue; } + if (!Deinflector.rulesMatch(rules, expectedFlags)) { continue; } } let okay = true; if (typeof expectedReasons !== 'undefined') { diff --git a/types/ext/deinflector.d.ts b/types/ext/deinflector.d.ts index 5defbf7959..18d0f04a89 100644 --- a/types/ext/deinflector.d.ts +++ b/types/ext/deinflector.d.ts @@ -20,12 +20,14 @@ import type * as TranslationInternal from './translation-internal'; export type ReasonTypeRaw = 'v1' | 'v1d' | 'v1p' | 'v5' | 'vs' | 'vk' | 'vz' | 'adj-i' | 'iru'; export type ReasonsRaw = { - [reason: string]: { - kanaIn: string; - kanaOut: string; - rulesIn: ReasonTypeRaw[]; - rulesOut: ReasonTypeRaw[]; - }[]; + [reason: string]: ReasonRaw[]; +}; + +export type ReasonRaw = { + kanaIn: string; + kanaOut: string; + rulesIn: ReasonTypeRaw[]; + rulesOut: ReasonTypeRaw[]; }; export type ReasonVariant = [ From acc013a1a8051d34322f0f5f91d7bdecc0a18843 Mon Sep 17 00:00:00 2001 From: toasted-nutbread Date: Sun, 28 Jan 2024 07:22:47 -0500 Subject: [PATCH 2/2] JapaneseUtil refactor (#555) * Copy functions from JapaneseUtil * Remove 
JapaneseUtil * Update usages of JapaneseUtil functions --- .eslintrc.json | 4 +- ext/js/background/backend.js | 21 +- ext/js/background/offscreen.js | 5 - ext/js/comm/clipboard-monitor.js | 9 +- ext/js/data/anki-note-builder.js | 12 +- ext/js/data/sandbox/anki-note-data-creator.js | 12 +- ext/js/display/display-anki.js | 5 +- ext/js/display/display-generator.js | 26 +- ext/js/display/display.js | 14 +- ext/js/display/popup-main.js | 7 +- ext/js/display/query-parser.js | 40 +- .../sandbox/pronunciation-generator.js | 24 +- .../sandbox/structured-content-generator.js | 9 +- ext/js/display/search-display-controller.js | 4 +- ext/js/display/search-main.js | 10 +- ext/js/language/japanese-wanakana.js | 122 +++ ext/js/language/japanese.js | 740 +++++++++++++++ ext/js/language/sandbox/japanese-util.js | 885 ------------------ ext/js/language/translator.js | 22 +- ext/js/media/audio-downloader.js | 9 +- .../settings/anki-templates-controller.js | 3 +- .../sandbox/anki-template-renderer.js | 26 +- test/fixtures/translator-test.js | 6 +- test/japanese-util.test.js | 12 +- test/utilities/anki.js | 4 +- types/ext/display.d.ts | 3 - types/ext/translator.d.ts | 3 - 27 files changed, 980 insertions(+), 1057 deletions(-) create mode 100644 ext/js/language/japanese-wanakana.js create mode 100644 ext/js/language/japanese.js delete mode 100644 ext/js/language/sandbox/japanese-util.js diff --git a/.eslintrc.json b/.eslintrc.json index d4bb3d23ff..eec25b5c36 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -613,7 +613,9 @@ "files": [ "ext/js/core.js", "ext/js/core/extension-error.js", - "ext/js/**/sandbox/**/*.js" + "ext/js/**/sandbox/**/*.js", + "ext/js/language/japanese.js", + "ext/js/language/japanese-wanakana.js" ], "env": { "webextensions": false diff --git a/ext/js/background/backend.js b/ext/js/background/backend.js index b61f27b108..74c1370c11 100644 --- a/ext/js/background/backend.js +++ b/ext/js/background/backend.js @@ -16,7 +16,6 @@ * along with this program. 
If not, see <https://www.gnu.org/licenses/>. */ -import * as wanakana from '../../lib/wanakana.js'; import {AccessibilityController} from '../accessibility/accessibility-controller.js'; import {AnkiConnect} from '../comm/anki-connect.js'; import {ClipboardMonitor} from '../comm/clipboard-monitor.js'; @@ -34,7 +33,7 @@ import {ArrayBufferUtil} from '../data/sandbox/array-buffer-util.js'; import {DictionaryDatabase} from '../dictionary/dictionary-database.js'; import {Environment} from '../extension/environment.js'; import {ObjectPropertyAccessor} from '../general/object-property-accessor.js'; -import {JapaneseUtil} from '../language/sandbox/japanese-util.js'; +import {distributeFuriganaInflected, isCodePointJapanese, isStringPartiallyJapanese, convertKatakanaToHiragana as jpConvertKatakanaToHiragana} from '../language/japanese.js'; import {Translator} from '../language/translator.js'; import {AudioDownloader} from '../media/audio-downloader.js'; import {MediaUtil} from '../media/media-util.js'; @@ -54,8 +53,6 @@ export class Backend { constructor(webExtension) { /** @type {import('../extension/web-extension.js').WebExtension} */ this._webExtension = webExtension; - /** @type {JapaneseUtil} */ - this._japaneseUtil = new JapaneseUtil(wanakana); /** @type {Environment} */ this._environment = new Environment(); /** @type {AnkiConnect} */ @@ -70,7 +67,6 @@ export class Backend { this._dictionaryDatabase = new DictionaryDatabase(); /** @type {Translator|TranslatorProxy} */ this._translator = new Translator({ - japaneseUtil: this._japaneseUtil, database: this._dictionaryDatabase }); /** @type {ClipboardReader|ClipboardReaderProxy} */ @@ -93,7 +89,6 @@ export class Backend { /** @type {ClipboardMonitor} */ this._clipboardMonitor = new ClipboardMonitor({ - japaneseUtil: this._japaneseUtil, clipboardReader: this._clipboardReader }); /** @type {?import('settings').Options} */ @@ -108,7 +103,6 @@ export class Backend { this._requestBuilder = new RequestBuilder(); /** @type {AudioDownloader} */ 
this._audioDownloader = new AudioDownloader({ - japaneseUtil: this._japaneseUtil, requestBuilder: this._requestBuilder }); /** @type {OptionsUtil} */ @@ -852,7 +846,7 @@ export class Backend { /** @type {import('api').ApiHandler<'textHasJapaneseCharacters'>} */ _onApiTextHasJapaneseCharacters({text}) { - return this._japaneseUtil.isStringPartiallyJapanese(text); + return isStringPartiallyJapanese(text); } /** @type {import('api').ApiHandler<'getTermFrequencies'>} */ @@ -1376,7 +1370,6 @@ export class Backend { * @returns {Promise} */ async _textParseScanning(text, scanLength, optionsContext) { - const jp = this._japaneseUtil; /** @type {import('translator').FindTermsMode} */ const mode = 'simple'; const options = this._getProfileOptions(optionsContext, false); @@ -1398,13 +1391,13 @@ export class Backend { if ( dictionaryEntries.length > 0 && originalTextLength > 0 && - (originalTextLength !== character.length || jp.isCodePointJapanese(codePoint)) + (originalTextLength !== character.length || isCodePointJapanese(codePoint)) ) { previousUngroupedSegment = null; const {headwords: [{term, reading}]} = dictionaryEntries[0]; const source = text.substring(i, i + originalTextLength); const textSegments = []; - for (const {text: text2, reading: reading2} of jp.distributeFuriganaInflected(term, reading, source)) { + for (const {text: text2, reading: reading2} of distributeFuriganaInflected(term, reading, source)) { textSegments.push({text: text2, reading: reading2}); } results.push(textSegments); @@ -1427,8 +1420,6 @@ export class Backend { * @returns {Promise} */ async _textParseMecab(text) { - const jp = this._japaneseUtil; - let parseTextResults; try { parseTextResults = await this._mecab.parseText(text); @@ -1444,9 +1435,9 @@ export class Backend { for (const line of lines) { for (const {term, reading, source} of line) { const termParts = []; - for (const {text: text2, reading: reading2} of jp.distributeFuriganaInflected( + for (const {text: text2, reading: reading2} of 
distributeFuriganaInflected( term.length > 0 ? term : source, - jp.convertKatakanaToHiragana(reading), + jpConvertKatakanaToHiragana(reading), source )) { termParts.push({text: text2, reading: reading2}); diff --git a/ext/js/background/offscreen.js b/ext/js/background/offscreen.js index 470ea0e26d..a0f5592e55 100644 --- a/ext/js/background/offscreen.js +++ b/ext/js/background/offscreen.js @@ -16,12 +16,10 @@ * along with this program. If not, see . */ -import * as wanakana from '../../lib/wanakana.js'; import {ClipboardReader} from '../comm/clipboard-reader.js'; import {createApiMap, invokeApiMapHandler} from '../core/api-map.js'; import {ArrayBufferUtil} from '../data/sandbox/array-buffer-util.js'; import {DictionaryDatabase} from '../dictionary/dictionary-database.js'; -import {JapaneseUtil} from '../language/sandbox/japanese-util.js'; import {Translator} from '../language/translator.js'; /** @@ -33,13 +31,10 @@ export class Offscreen { * Creates a new instance. */ constructor() { - /** @type {JapaneseUtil} */ - this._japaneseUtil = new JapaneseUtil(wanakana); /** @type {DictionaryDatabase} */ this._dictionaryDatabase = new DictionaryDatabase(); /** @type {Translator} */ this._translator = new Translator({ - japaneseUtil: this._japaneseUtil, database: this._dictionaryDatabase }); /** @type {ClipboardReader} */ diff --git a/ext/js/comm/clipboard-monitor.js b/ext/js/comm/clipboard-monitor.js index a1ea33620a..a8e79a1b13 100644 --- a/ext/js/comm/clipboard-monitor.js +++ b/ext/js/comm/clipboard-monitor.js @@ -17,18 +17,17 @@ */ import {EventDispatcher} from '../core/event-dispatcher.js'; +import {isStringPartiallyJapanese} from '../language/japanese.js'; /** * @augments EventDispatcher */ export class ClipboardMonitor extends EventDispatcher { /** - * @param {{japaneseUtil: import('../language/sandbox/japanese-util.js').JapaneseUtil, clipboardReader: import('clipboard-monitor').ClipboardReaderLike}} details + * @param {{clipboardReader: 
import('clipboard-monitor').ClipboardReaderLike}} details */ - constructor({japaneseUtil, clipboardReader}) { + constructor({clipboardReader}) { super(); - /** @type {import('../language/sandbox/japanese-util.js').JapaneseUtil} */ - this._japaneseUtil = japaneseUtil; /** @type {import('clipboard-monitor').ClipboardReaderLike} */ this._clipboardReader = clipboardReader; /** @type {?import('core').Timeout} */ @@ -72,7 +71,7 @@ export class ClipboardMonitor extends EventDispatcher { text !== this._previousText ) { this._previousText = text; - if (canChange && this._japaneseUtil.isStringPartiallyJapanese(text)) { + if (canChange && isStringPartiallyJapanese(text)) { this.trigger('change', {text}); } } diff --git a/ext/js/data/anki-note-builder.js b/ext/js/data/anki-note-builder.js index 48564d54d0..815e7f3f07 100644 --- a/ext/js/data/anki-note-builder.js +++ b/ext/js/data/anki-note-builder.js @@ -16,20 +16,18 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. */ -import {deferPromise} from '../core/utilities.js'; import {ExtensionError} from '../core/extension-error.js'; +import {deferPromise} from '../core/utilities.js'; +import {convertHiraganaToKatakana, convertKatakanaToHiragana} from '../language/japanese.js'; import {yomitan} from '../yomitan.js'; import {AnkiUtil} from './anki-util.js'; export class AnkiNoteBuilder { /** * Initiate an instance of AnkiNoteBuilder. 
- * @param {import('../language/sandbox/japanese-util.js').JapaneseUtil} japaneseUtil * @param {import('../templates/template-renderer-proxy.js').TemplateRendererProxy|import('../templates/sandbox/template-renderer.js').TemplateRenderer} templateRenderer */ - constructor(japaneseUtil, templateRenderer) { - /** @type {import('../language/sandbox/japanese-util.js').JapaneseUtil} */ - this._japaneseUtil = japaneseUtil; + constructor(templateRenderer) { /** @type {RegExp} */ this._markerPattern = AnkiUtil.cloneFieldMarkerPattern(true); /** @type {import('../templates/template-renderer-proxy.js').TemplateRendererProxy|import('../templates/sandbox/template-renderer.js').TemplateRenderer} */ @@ -530,9 +528,9 @@ export class AnkiNoteBuilder { _convertReading(reading, readingMode) { switch (readingMode) { case 'hiragana': - return this._japaneseUtil.convertKatakanaToHiragana(reading); + return convertKatakanaToHiragana(reading); case 'katakana': - return this._japaneseUtil.convertHiraganaToKatakana(reading); + return convertHiraganaToKatakana(reading); default: return reading; } diff --git a/ext/js/data/sandbox/anki-note-data-creator.js b/ext/js/data/sandbox/anki-note-data-creator.js index 5a608cd23d..fc787a6682 100644 --- a/ext/js/data/sandbox/anki-note-data-creator.js +++ b/ext/js/data/sandbox/anki-note-data-creator.js @@ -17,21 +17,13 @@ */ import {DictionaryDataUtil} from '../../dictionary/dictionary-data-util.js'; +import {distributeFurigana} from '../../language/japanese.js'; /** * This class is used to convert the internal dictionary entry format to the * format used by Anki, for backwards compatibility. */ export class AnkiNoteDataCreator { - /** - * Creates a new instance. - * @param {import('../../language/sandbox/japanese-util.js').JapaneseUtil} japaneseUtil An instance of `JapaneseUtil`. 
- */ - constructor(japaneseUtil) { - /** @type {import('../../language/sandbox/japanese-util.js').JapaneseUtil} */ - this._japaneseUtil = japaneseUtil; - } - /** * Creates a compatibility representation of the specified data. * @param {string} marker The marker that is being used for template rendering. @@ -860,7 +852,7 @@ export class AnkiNoteDataCreator { _getTermHeadwordFuriganaSegments(term, reading) { /** @type {import('anki-templates').FuriganaSegment[]} */ const result = []; - for (const {text, reading: reading2} of this._japaneseUtil.distributeFurigana(term, reading)) { + for (const {text, reading: reading2} of distributeFurigana(term, reading)) { result.push({text, furigana: reading2}); } return result; diff --git a/ext/js/display/display-anki.js b/ext/js/display/display-anki.js index c51ddfa273..5433142d1e 100644 --- a/ext/js/display/display-anki.js +++ b/ext/js/display/display-anki.js @@ -30,9 +30,8 @@ export class DisplayAnki { /** * @param {import('./display.js').Display} display * @param {import('./display-audio.js').DisplayAudio} displayAudio - * @param {import('../language/sandbox/japanese-util.js').JapaneseUtil} japaneseUtil */ - constructor(display, displayAudio, japaneseUtil) { + constructor(display, displayAudio) { /** @type {import('./display.js').Display} */ this._display = display; /** @type {import('./display-audio.js').DisplayAudio} */ @@ -42,7 +41,7 @@ export class DisplayAnki { /** @type {?string} */ this._ankiFieldTemplatesDefault = null; /** @type {AnkiNoteBuilder} */ - this._ankiNoteBuilder = new AnkiNoteBuilder(japaneseUtil, new TemplateRendererProxy()); + this._ankiNoteBuilder = new AnkiNoteBuilder(new TemplateRendererProxy()); /** @type {?import('./display-notification.js').DisplayNotification} */ this._errorNotification = null; /** @type {?EventListenerCollection} */ diff --git a/ext/js/display/display-generator.js b/ext/js/display/display-generator.js index 7bf13b77e1..eef58bb074 100644 --- a/ext/js/display/display-generator.js 
+++ b/ext/js/display/display-generator.js @@ -16,10 +16,11 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. */ -import {isObject} from '../core/utilities.js'; import {ExtensionError} from '../core/extension-error.js'; +import {isObject} from '../core/utilities.js'; import {DictionaryDataUtil} from '../dictionary/dictionary-data-util.js'; import {HtmlTemplateCollection} from '../dom/html-template-collection.js'; +import {distributeFurigana, getKanaMorae, getPitchCategory, isCodePointKanji, isStringPartiallyJapanese} from '../language/japanese.js'; import {yomitan} from '../yomitan.js'; import {PronunciationGenerator} from './sandbox/pronunciation-generator.js'; import {StructuredContentGenerator} from './sandbox/structured-content-generator.js'; @@ -28,9 +29,7 @@ export class DisplayGenerator { /** * @param {import('display').DisplayGeneratorConstructorDetails} details */ - constructor({japaneseUtil, contentManager, hotkeyHelpController = null}) { - /** @type {import('../language/sandbox/japanese-util.js').JapaneseUtil} */ - this._japaneseUtil = japaneseUtil; + constructor({contentManager, hotkeyHelpController = null}) { /** @type {import('./display-content-manager.js').DisplayContentManager} */ this._contentManager = contentManager; /** @type {?import('../input/hotkey-help-controller.js').HotkeyHelpController} */ @@ -38,9 +37,9 @@ export class DisplayGenerator { /** @type {HtmlTemplateCollection} */ this._templates = new HtmlTemplateCollection(); /** @type {StructuredContentGenerator} */ - this._structuredContentGenerator = new StructuredContentGenerator(this._contentManager, japaneseUtil, document); + this._structuredContentGenerator = new StructuredContentGenerator(this._contentManager, document); /** @type {PronunciationGenerator} */ - this._pronunciationGenerator = new PronunciationGenerator(japaneseUtil); + this._pronunciationGenerator = new PronunciationGenerator(); } /** */ @@ -725,11 +724,9 @@ export class DisplayGenerator { * @returns {HTMLElement} */ 
_createPronunciationPitchAccent(pitchAccent, details) { - const jp = this._japaneseUtil; - const {position, nasalPositions, devoicePositions, tags} = pitchAccent; const {reading, exclusiveTerms, exclusiveReadings} = details; - const morae = jp.getKanaMorae(reading); + const morae = getKanaMorae(reading); const node = this._instantiate('pronunciation'); @@ -912,10 +909,9 @@ export class DisplayGenerator { * @param {string} text */ _appendKanjiLinks(container, text) { - const jp = this._japaneseUtil; let part = ''; for (const c of text) { - if (jp.isCodePointKanji(/** @type {number} */ (c.codePointAt(0)))) { + if (isCodePointKanji(/** @type {number} */ (c.codePointAt(0)))) { if (part.length > 0) { container.appendChild(document.createTextNode(part)); part = ''; @@ -969,7 +965,7 @@ export class DisplayGenerator { */ _appendFurigana(container, term, reading, addText) { container.lang = 'ja'; - const segments = this._japaneseUtil.distributeFurigana(term, reading); + const segments = distributeFurigana(term, reading); for (const {text, reading: furigana} of segments) { if (furigana) { const ruby = document.createElement('ruby'); @@ -1000,7 +996,7 @@ export class DisplayGenerator { _setTextContent(node, value, language) { if (typeof language === 'string') { node.lang = language; - } else if (this._japaneseUtil.isStringPartiallyJapanese(value)) { + } else if (isStringPartiallyJapanese(value)) { node.lang = 'ja'; } @@ -1017,7 +1013,7 @@ export class DisplayGenerator { // cause the text to not copy correctly. 
if (typeof language === 'string') { node.lang = language; - } else if (this._japaneseUtil.isStringPartiallyJapanese(value)) { + } else if (isStringPartiallyJapanese(value)) { node.lang = 'ja'; } @@ -1051,7 +1047,7 @@ export class DisplayGenerator { if (termPronunciation.headwordIndex !== headwordIndex) { continue; } for (const pronunciation of termPronunciation.pronunciations) { if (pronunciation.type !== 'pitch-accent') { continue; } - const category = this._japaneseUtil.getPitchCategory(reading, pronunciation.position, isVerbOrAdjective); + const category = getPitchCategory(reading, pronunciation.position, isVerbOrAdjective); if (category !== null) { categories.add(category); } diff --git a/ext/js/display/display.js b/ext/js/display/display.js index 689481f446..cff8730928 100644 --- a/ext/js/display/display.js +++ b/ext/js/display/display.js @@ -48,11 +48,10 @@ export class Display extends EventDispatcher { * @param {number|undefined} tabId * @param {number|undefined} frameId * @param {import('display').DisplayPageType} pageType - * @param {import('../language/sandbox/japanese-util.js').JapaneseUtil} japaneseUtil * @param {import('../dom/document-focus-controller.js').DocumentFocusController} documentFocusController * @param {import('../input/hotkey-handler.js').HotkeyHandler} hotkeyHandler */ - constructor(tabId, frameId, pageType, japaneseUtil, documentFocusController, hotkeyHandler) { + constructor(tabId, frameId, pageType, documentFocusController, hotkeyHandler) { super(); /** @type {number|undefined} */ this._tabId = tabId; @@ -60,8 +59,6 @@ export class Display extends EventDispatcher { this._frameId = frameId; /** @type {import('display').DisplayPageType} */ this._pageType = pageType; - /** @type {import('../language/sandbox/japanese-util.js').JapaneseUtil} */ - this._japaneseUtil = japaneseUtil; /** @type {import('../dom/document-focus-controller.js').DocumentFocusController} */ this._documentFocusController = documentFocusController; /** @type 
{import('../input/hotkey-handler.js').HotkeyHandler} */ @@ -90,7 +87,6 @@ export class Display extends EventDispatcher { this._hotkeyHelpController = new HotkeyHelpController(); /** @type {DisplayGenerator} */ this._displayGenerator = new DisplayGenerator({ - japaneseUtil, contentManager: this._contentManager, hotkeyHelpController: this._hotkeyHelpController }); @@ -132,8 +128,7 @@ export class Display extends EventDispatcher { this._queryParserContainer = querySelectorNotNull(document, '#query-parser-container'); /** @type {QueryParser} */ this._queryParser = new QueryParser({ - getSearchContext: this._getSearchContext.bind(this), - japaneseUtil + getSearchContext: this._getSearchContext.bind(this) }); /** @type {HTMLElement} */ this._contentScrollElement = querySelectorNotNull(document, '#content-scroll'); @@ -240,11 +235,6 @@ export class Display extends EventDispatcher { this._updateQueryParser(); } - /** @type {import('../language/sandbox/japanese-util.js').JapaneseUtil} */ - get japaneseUtil() { - return this._japaneseUtil; - } - /** @type {number} */ get depth() { return this._depth; diff --git a/ext/js/display/popup-main.js b/ext/js/display/popup-main.js index d4f622f2d5..870e039e63 100644 --- a/ext/js/display/popup-main.js +++ b/ext/js/display/popup-main.js @@ -19,7 +19,6 @@ import {log} from '../core/logger.js'; import {DocumentFocusController} from '../dom/document-focus-controller.js'; import {HotkeyHandler} from '../input/hotkey-handler.js'; -import {JapaneseUtil} from '../language/sandbox/japanese-util.js'; import {yomitan} from '../yomitan.js'; import {DisplayAnki} from './display-anki.js'; import {DisplayAudio} from './display-audio.js'; @@ -37,18 +36,16 @@ async function main() { const {tabId, frameId} = await yomitan.api.frameInformationGet(); - const japaneseUtil = new JapaneseUtil(null); - const hotkeyHandler = new HotkeyHandler(); hotkeyHandler.prepare(); - const display = new Display(tabId, frameId, 'popup', japaneseUtil, 
documentFocusController, hotkeyHandler); + const display = new Display(tabId, frameId, 'popup', documentFocusController, hotkeyHandler); await display.prepare(); const displayAudio = new DisplayAudio(display); displayAudio.prepare(); - const displayAnki = new DisplayAnki(display, displayAudio, japaneseUtil); + const displayAnki = new DisplayAnki(display, displayAudio); displayAnki.prepare(); const displayProfileSelection = new DisplayProfileSelection(display); diff --git a/ext/js/display/query-parser.js b/ext/js/display/query-parser.js index e129e1be13..eb053f3857 100644 --- a/ext/js/display/query-parser.js +++ b/ext/js/display/query-parser.js @@ -19,6 +19,7 @@ import {EventDispatcher} from '../core/event-dispatcher.js'; import {log} from '../core/logger.js'; import {querySelectorNotNull} from '../dom/query-selector.js'; +import {convertHiraganaToKatakana, convertKatakanaToHiragana, isStringEntirelyKana} from '../language/japanese.js'; import {TextScanner} from '../language/text-scanner.js'; import {yomitan} from '../yomitan.js'; @@ -29,12 +30,10 @@ export class QueryParser extends EventDispatcher { /** * @param {import('display').QueryParserConstructorDetails} details */ - constructor({getSearchContext, japaneseUtil}) { + constructor({getSearchContext}) { super(); /** @type {import('display').GetSearchContextCallback} */ this._getSearchContext = getSearchContext; - /** @type {import('../language/sandbox/japanese-util.js').JapaneseUtil} */ - this._japaneseUtil = japaneseUtil; /** @type {string} */ this._text = ''; /** @type {?import('core').TokenObject} */ @@ -65,6 +64,10 @@ export class QueryParser extends EventDispatcher { searchKanji: false, searchOnClick: true }); + /** @type {?(import('../language/japanese-wanakana.js'))} */ + this._japaneseWanakanaModule = null; + /** @type {?Promise} */ + this._japaneseWanakanaModuleImport = null; } /** @type {string} */ @@ -93,7 +96,7 @@ export class QueryParser extends EventDispatcher { 
this._queryParser.dataset.termSpacing = `${termSpacing}`; } if (typeof readingMode === 'string') { - this._readingMode = readingMode; + this._setReadingMode(readingMode); } if (typeof useInternalParser === 'boolean') { this._useInternalParser = useInternalParser; @@ -346,15 +349,15 @@ export class QueryParser extends EventDispatcher { _convertReading(term, reading) { switch (this._readingMode) { case 'hiragana': - return this._japaneseUtil.convertKatakanaToHiragana(reading); + return convertKatakanaToHiragana(reading); case 'katakana': - return this._japaneseUtil.convertHiraganaToKatakana(reading); + return convertHiraganaToKatakana(reading); case 'romaji': - if (this._japaneseUtil.convertToRomajiSupported()) { + if (this._japaneseWanakanaModule !== null) { if (reading.length > 0) { - return this._japaneseUtil.convertToRomaji(reading); - } else if (this._japaneseUtil.isStringEntirelyKana(term)) { - return this._japaneseUtil.convertToRomaji(term); + return this._japaneseWanakanaModule.convertToRomaji(reading); + } else if (isStringEntirelyKana(term)) { + return this._japaneseWanakanaModule.convertToRomaji(term); } } return reading; @@ -398,4 +401,21 @@ export class QueryParser extends EventDispatcher { node = node.parentNode; } } + + /** + * @param {import('settings').ParsingReadingMode} value + */ + _setReadingMode(value) { + this._readingMode = value; + if (value === 'romaji') { + this._loadJapaneseWanakanaModule(); + } + } + + /** */ + _loadJapaneseWanakanaModule() { + if (this._japaneseWanakanaModuleImport !== null) { return; } + this._japaneseWanakanaModuleImport = import('../language/japanese-wanakana.js'); + this._japaneseWanakanaModuleImport.then((value) => { this._japaneseWanakanaModule = value; }); + } } diff --git a/ext/js/display/sandbox/pronunciation-generator.js b/ext/js/display/sandbox/pronunciation-generator.js index cfcf82a10d..45631e740b 100644 --- a/ext/js/display/sandbox/pronunciation-generator.js +++ 
b/ext/js/display/sandbox/pronunciation-generator.js @@ -16,15 +16,9 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. */ -export class PronunciationGenerator { - /** - * @param {import('../../language/sandbox/japanese-util.js').JapaneseUtil} japaneseUtil - */ - constructor(japaneseUtil) { - /** @type {import('../../language/sandbox/japanese-util.js').JapaneseUtil} */ - this._japaneseUtil = japaneseUtil; - } +import {getKanaDiacriticInfo, isMoraPitchHigh} from '../../language/japanese.js'; +export class PronunciationGenerator { /** * @param {string[]} morae * @param {number} downstepPosition @@ -33,7 +27,6 @@ export class PronunciationGenerator { * @returns {HTMLSpanElement} */ createPronunciationText(morae, downstepPosition, nasalPositions, devoicePositions) { - const jp = this._japaneseUtil; const nasalPositionsSet = nasalPositions.length > 0 ? new Set(nasalPositions) : null; const devoicePositionsSet = devoicePositions.length > 0 ? new Set(devoicePositions) : null; const container = document.createElement('span'); @@ -41,8 +34,8 @@ export class PronunciationGenerator { for (let i = 0, ii = morae.length; i < ii; ++i) { const i1 = i + 1; const mora = morae[i]; - const highPitch = jp.isMoraPitchHigh(i, downstepPosition); - const highPitchNext = jp.isMoraPitchHigh(i1, downstepPosition); + const highPitch = isMoraPitchHigh(i, downstepPosition); + const highPitchNext = isMoraPitchHigh(i1, downstepPosition); const nasal = nasalPositionsSet !== null && nasalPositionsSet.has(i1); const devoice = devoicePositionsSet !== null && devoicePositionsSet.has(i1); @@ -76,7 +69,7 @@ export class PronunciationGenerator { const n2 = characterNodes[0]; const character = /** @type {string} */ (n2.textContent); - const characterInfo = jp.getKanaDiacriticInfo(character); + const characterInfo = getKanaDiacriticInfo(character); if (characterInfo !== null) { n1.dataset.originalText = mora; n2.dataset.originalText = character; @@ -111,7 +104,6 @@ export class PronunciationGenerator { * @returns 
{SVGSVGElement} */ createPronunciationGraph(morae, downstepPosition) { - const jp = this._japaneseUtil; const ii = morae.length; const svgns = 'http://www.w3.org/2000/svg'; @@ -131,8 +123,8 @@ export class PronunciationGenerator { const pathPoints = []; for (let i = 0; i < ii; ++i) { - const highPitch = jp.isMoraPitchHigh(i, downstepPosition); - const highPitchNext = jp.isMoraPitchHigh(i + 1, downstepPosition); + const highPitch = isMoraPitchHigh(i, downstepPosition); + const highPitchNext = isMoraPitchHigh(i + 1, downstepPosition); const x = i * 50 + 25; const y = highPitch ? 25 : 75; if (highPitch && !highPitchNext) { @@ -148,7 +140,7 @@ export class PronunciationGenerator { pathPoints.splice(0, ii - 1); { - const highPitch = jp.isMoraPitchHigh(ii, downstepPosition); + const highPitch = isMoraPitchHigh(ii, downstepPosition); const x = ii * 50 + 25; const y = highPitch ? 25 : 75; this._addGraphTriangle(svg, svgns, x, y); diff --git a/ext/js/display/sandbox/structured-content-generator.js b/ext/js/display/sandbox/structured-content-generator.js index ee86a7f4cb..60bf0ee51c 100644 --- a/ext/js/display/sandbox/structured-content-generator.js +++ b/ext/js/display/sandbox/structured-content-generator.js @@ -16,17 +16,16 @@ * along with this program. If not, see . 
*/ +import {isStringPartiallyJapanese} from '../../language/japanese.js'; + export class StructuredContentGenerator { /** * @param {import('../../display/display-content-manager.js').DisplayContentManager|import('../../templates/sandbox/anki-template-renderer-content-manager.js').AnkiTemplateRendererContentManager} contentManager - * @param {import('../../language/sandbox/japanese-util.js').JapaneseUtil} japaneseUtil * @param {Document} document */ - constructor(contentManager, japaneseUtil, document) { + constructor(contentManager, document) { /** @type {import('../../display/display-content-manager.js').DisplayContentManager|import('../../templates/sandbox/anki-template-renderer-content-manager.js').AnkiTemplateRendererContentManager} */ this._contentManager = contentManager; - /** @type {import('../../language/sandbox/japanese-util.js').JapaneseUtil} */ - this._japaneseUtil = japaneseUtil; /** @type {Document} */ this._document = document; } @@ -164,7 +163,7 @@ export class StructuredContentGenerator { if (typeof content === 'string') { if (content.length > 0) { container.appendChild(this._createTextNode(content)); - if (language === null && this._japaneseUtil.isStringPartiallyJapanese(content)) { + if (language === null && isStringPartiallyJapanese(content)) { container.lang = 'ja'; } } diff --git a/ext/js/display/search-display-controller.js b/ext/js/display/search-display-controller.js index 594a80aade..ff4340c190 100644 --- a/ext/js/display/search-display-controller.js +++ b/ext/js/display/search-display-controller.js @@ -29,10 +29,9 @@ export class SearchDisplayController { * @param {number|undefined} frameId * @param {import('./display.js').Display} display * @param {import('./display-audio.js').DisplayAudio} displayAudio - * @param {import('../language/sandbox/japanese-util.js').JapaneseUtil} japaneseUtil * @param {import('./search-persistent-state-controller.js').SearchPersistentStateController} searchPersistentStateController */ - constructor(tabId, 
frameId, display, displayAudio, japaneseUtil, searchPersistentStateController) { + constructor(tabId, frameId, display, displayAudio, searchPersistentStateController) { /** @type {number|undefined} */ this._tabId = tabId; /** @type {number|undefined} */ @@ -71,7 +70,6 @@ export class SearchDisplayController { this._clipboardMonitorEnabled = false; /** @type {ClipboardMonitor} */ this._clipboardMonitor = new ClipboardMonitor({ - japaneseUtil, clipboardReader: { getText: yomitan.api.clipboardGet.bind(yomitan.api) } diff --git a/ext/js/display/search-main.js b/ext/js/display/search-main.js index 3cdd1f2535..dedad1636a 100644 --- a/ext/js/display/search-main.js +++ b/ext/js/display/search-main.js @@ -16,11 +16,9 @@ * along with this program. If not, see . */ -import * as wanakana from '../../lib/wanakana.js'; import {log} from '../core/logger.js'; import {DocumentFocusController} from '../dom/document-focus-controller.js'; import {HotkeyHandler} from '../input/hotkey-handler.js'; -import {JapaneseUtil} from '../language/sandbox/japanese-util.js'; import {yomitan} from '../yomitan.js'; import {DisplayAnki} from './display-anki.js'; import {DisplayAudio} from './display-audio.js'; @@ -45,21 +43,19 @@ async function main() { const {tabId, frameId} = await yomitan.api.frameInformationGet(); - const japaneseUtil = new JapaneseUtil(wanakana); - const hotkeyHandler = new HotkeyHandler(); hotkeyHandler.prepare(); - const display = new Display(tabId, frameId, 'search', japaneseUtil, documentFocusController, hotkeyHandler); + const display = new Display(tabId, frameId, 'search', documentFocusController, hotkeyHandler); await display.prepare(); const displayAudio = new DisplayAudio(display); displayAudio.prepare(); - const displayAnki = new DisplayAnki(display, displayAudio, japaneseUtil); + const displayAnki = new DisplayAnki(display, displayAudio); displayAnki.prepare(); - const searchDisplayController = new SearchDisplayController(tabId, frameId, display, displayAudio, 
japaneseUtil, searchPersistentStateController); + const searchDisplayController = new SearchDisplayController(tabId, frameId, display, displayAudio, searchPersistentStateController); await searchDisplayController.prepare(); display.initializeState(); diff --git a/ext/js/language/japanese-wanakana.js b/ext/js/language/japanese-wanakana.js new file mode 100644 index 0000000000..b48ab6d6a0 --- /dev/null +++ b/ext/js/language/japanese-wanakana.js @@ -0,0 +1,122 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +import * as wanakana from '../../lib/wanakana.js'; + +/** + * @param {string} text + * @param {?import('../general/text-source-map.js').TextSourceMap} sourceMap + * @param {number} sourceMapStart + * @returns {string} + */ +function convertAlphabeticPartToKana(text, sourceMap, sourceMapStart) { + const result = wanakana.toHiragana(text); + + // Generate source mapping + if (sourceMap !== null) { + let i = 0; + let resultPos = 0; + const ii = text.length; + while (i < ii) { + // Find smallest matching substring + let iNext = i + 1; + let resultPosNext = result.length; + while (iNext < ii) { + const t = wanakana.toHiragana(text.substring(0, iNext)); + if (t === result.substring(0, t.length)) { + resultPosNext = t.length; + break; + } + ++iNext; + } + + // Merge characters + const removals = iNext - i - 1; + if (removals > 0) { + sourceMap.combine(sourceMapStart, removals); + } + ++sourceMapStart; + + // Empty elements + const additions = resultPosNext - resultPos - 1; + for (let j = 0; j < additions; ++j) { + sourceMap.insert(sourceMapStart, 0); + ++sourceMapStart; + } + + i = iNext; + resultPos = resultPosNext; + } + } + + return result; +} + +/** + * @param {string} text + * @returns {string} + */ +export function convertToKana(text) { + return wanakana.toKana(text); +} + +/** + * @param {string} text + * @returns {string} + */ +export function convertToRomaji(text) { + return wanakana.toRomaji(text); +} + +/** + * @param {string} text + * @param {?import('../general/text-source-map.js').TextSourceMap} sourceMap + * @returns {string} + */ +export function convertAlphabeticToKana(text, sourceMap = null) { + let part = ''; + let result = ''; + + for (const char of text) { + // Note: 0x61 is the character code for 'a' + let c = /** @type {number} */ (char.codePointAt(0)); + if (c >= 0x41 && c <= 0x5a) { // ['A', 'Z'] + c += (0x61 - 0x41); + } else if (c >= 0x61 && c <= 0x7a) { // ['a', 'z'] + // NOP; c += (0x61 - 0x61); + } else if (c >= 0xff21 && c <= 0xff3a) 
{ // ['A', 'Z'] fullwidth + c += (0x61 - 0xff21); + } else if (c >= 0xff41 && c <= 0xff5a) { // ['a', 'z'] fullwidth + c += (0x61 - 0xff41); + } else if (c === 0x2d || c === 0xff0d) { // '-' or fullwidth dash + c = 0x2d; // '-' + } else { + if (part.length > 0) { + result += convertAlphabeticPartToKana(part, sourceMap, result.length); + part = ''; + } + result += char; + continue; + } + part += String.fromCodePoint(c); + } + + if (part.length > 0) { + result += convertAlphabeticPartToKana(part, sourceMap, result.length); + } + return result; +} diff --git a/ext/js/language/japanese.js b/ext/js/language/japanese.js new file mode 100644 index 0000000000..88eb5af589 --- /dev/null +++ b/ext/js/language/japanese.js @@ -0,0 +1,740 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063; +const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3; +const KATAKANA_SMALL_KA_CODE_POINT = 0x30f5; +const KATAKANA_SMALL_KE_CODE_POINT = 0x30f6; +const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 0x30fc; + +/** @type {import('japanese-util').CodepointRange} */ +const HIRAGANA_RANGE = [0x3040, 0x309f]; +/** @type {import('japanese-util').CodepointRange} */ +const KATAKANA_RANGE = [0x30a0, 0x30ff]; + +/** @type {import('japanese-util').CodepointRange} */ +const HIRAGANA_CONVERSION_RANGE = [0x3041, 0x3096]; +/** @type {import('japanese-util').CodepointRange} */ +const KATAKANA_CONVERSION_RANGE = [0x30a1, 0x30f6]; + +/** @type {import('japanese-util').CodepointRange[]} */ +const KANA_RANGES = [HIRAGANA_RANGE, KATAKANA_RANGE]; + +/** @type {import('japanese-util').CodepointRange} */ +const CJK_UNIFIED_IDEOGRAPHS_RANGE = [0x4e00, 0x9fff]; +/** @type {import('japanese-util').CodepointRange} */ +const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_RANGE = [0x3400, 0x4dbf]; +/** @type {import('japanese-util').CodepointRange} */ +const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_RANGE = [0x20000, 0x2a6df]; +/** @type {import('japanese-util').CodepointRange} */ +const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C_RANGE = [0x2a700, 0x2b73f]; +/** @type {import('japanese-util').CodepointRange} */ +const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE = [0x2b740, 0x2b81f]; +/** @type {import('japanese-util').CodepointRange} */ +const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE = [0x2b820, 0x2ceaf]; +/** @type {import('japanese-util').CodepointRange} */ +const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE = [0x2ceb0, 0x2ebef]; +/** @type {import('japanese-util').CodepointRange} */ +const CJK_COMPATIBILITY_IDEOGRAPHS_RANGE = [0xf900, 0xfaff]; +/** @type {import('japanese-util').CodepointRange} */ +const CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE = [0x2f800, 0x2fa1f]; +/** @type {import('japanese-util').CodepointRange[]} */ +const CJK_IDEOGRAPH_RANGES = [ + 
CJK_UNIFIED_IDEOGRAPHS_RANGE, + CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_RANGE, + CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_RANGE, + CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C_RANGE, + CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE, + CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE, + CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE, + CJK_COMPATIBILITY_IDEOGRAPHS_RANGE, + CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE +]; + +/** + * Japanese character ranges, roughly ordered in order of expected frequency. + * @type {import('japanese-util').CodepointRange[]} + */ +const JAPANESE_RANGES = [ + HIRAGANA_RANGE, + KATAKANA_RANGE, + + ...CJK_IDEOGRAPH_RANGES, + + [0xff66, 0xff9f], // Halfwidth katakana + + [0x30fb, 0x30fc], // Katakana punctuation + [0xff61, 0xff65], // Kana punctuation + [0x3000, 0x303f], // CJK punctuation + + [0xff10, 0xff19], // Fullwidth numbers + [0xff21, 0xff3a], // Fullwidth upper case Latin letters + [0xff41, 0xff5a], // Fullwidth lower case Latin letters + + [0xff01, 0xff0f], // Fullwidth punctuation 1 + [0xff1a, 0xff1f], // Fullwidth punctuation 2 + [0xff3b, 0xff3f], // Fullwidth punctuation 3 + [0xff5b, 0xff60], // Fullwidth punctuation 4 + [0xffe0, 0xffee] // Currency markers +]; + +const SMALL_KANA_SET = new Set(Array.from('ぁぃぅぇぉゃゅょゎァィゥェォャュョヮ')); + +const HALFWIDTH_KATAKANA_MAPPING = new Map([ + ['ヲ', 'ヲヺ-'], + ['ァ', 'ァ--'], + ['ィ', 'ィ--'], + ['ゥ', 'ゥ--'], + ['ェ', 'ェ--'], + ['ォ', 'ォ--'], + ['ャ', 'ャ--'], + ['ュ', 'ュ--'], + ['ョ', 'ョ--'], + ['ッ', 'ッ--'], + ['ー', 'ー--'], + ['ア', 'ア--'], + ['イ', 'イ--'], + ['ウ', 'ウヴ-'], + ['エ', 'エ--'], + ['オ', 'オ--'], + ['カ', 'カガ-'], + ['キ', 'キギ-'], + ['ク', 'クグ-'], + ['ケ', 'ケゲ-'], + ['コ', 'コゴ-'], + ['サ', 'サザ-'], + ['シ', 'シジ-'], + ['ス', 'スズ-'], + ['セ', 'セゼ-'], + ['ソ', 'ソゾ-'], + ['タ', 'タダ-'], + ['チ', 'チヂ-'], + ['ツ', 'ツヅ-'], + ['テ', 'テデ-'], + ['ト', 'トド-'], + ['ナ', 'ナ--'], + ['ニ', 'ニ--'], + ['ヌ', 'ヌ--'], + ['ネ', 'ネ--'], + ['ノ', 'ノ--'], + ['ハ', 'ハバパ'], + ['ヒ', 'ヒビピ'], + ['フ', 'フブプ'], + ['ヘ', 'ヘベペ'], + ['ホ', 'ホボポ'], + ['マ', 'マ--'], + ['ミ', 'ミ--'], + ['ム', 
'ム--'], + ['メ', 'メ--'], + ['モ', 'モ--'], + ['ヤ', 'ヤ--'], + ['ユ', 'ユ--'], + ['ヨ', 'ヨ--'], + ['ラ', 'ラ--'], + ['リ', 'リ--'], + ['ル', 'ル--'], + ['レ', 'レ--'], + ['ロ', 'ロ--'], + ['ワ', 'ワ--'], + ['ン', 'ン--'] +]); + +const VOWEL_TO_KANA_MAPPING = new Map([ + ['a', 'ぁあかがさざただなはばぱまゃやらゎわヵァアカガサザタダナハバパマャヤラヮワヵヷ'], + ['i', 'ぃいきぎしじちぢにひびぴみりゐィイキギシジチヂニヒビピミリヰヸ'], + ['u', 'ぅうくぐすずっつづぬふぶぷむゅゆるゥウクグスズッツヅヌフブプムュユルヴ'], + ['e', 'ぇえけげせぜてでねへべぺめれゑヶェエケゲセゼテデネヘベペメレヱヶヹ'], + ['o', 'ぉおこごそぞとどのほぼぽもょよろをォオコゴソゾトドノホボポモョヨロヲヺ'], + ['', 'のノ'] +]); + +/** @type {Map} */ +const KANA_TO_VOWEL_MAPPING = new Map(); +for (const [vowel, characters] of VOWEL_TO_KANA_MAPPING) { + for (const character of characters) { + KANA_TO_VOWEL_MAPPING.set(character, vowel); + } +} + +const kana = 'うゔ-かが-きぎ-くぐ-けげ-こご-さざ-しじ-すず-せぜ-そぞ-ただ-ちぢ-つづ-てで-とど-はばぱひびぴふぶぷへべぺほぼぽワヷ-ヰヸ-ウヴ-ヱヹ-ヲヺ-カガ-キギ-クグ-ケゲ-コゴ-サザ-シジ-スズ-セゼ-ソゾ-タダ-チヂ-ツヅ-テデ-トド-ハバパヒビピフブプヘベペホボポ'; +/** @type {Map} */ +const DIACRITIC_MAPPING = new Map(); +for (let i = 0, ii = kana.length; i < ii; i += 3) { + const character = kana[i]; + const dakuten = kana[i + 1]; + const handakuten = kana[i + 2]; + DIACRITIC_MAPPING.set(dakuten, {character, type: 'dakuten'}); + if (handakuten !== '-') { + DIACRITIC_MAPPING.set(handakuten, {character, type: 'handakuten'}); + } +} + + +/** + * @param {number} codePoint + * @param {import('japanese-util').CodepointRange} range + * @returns {boolean} + */ +function isCodePointInRange(codePoint, [min, max]) { + return (codePoint >= min && codePoint <= max); +} + +/** + * @param {number} codePoint + * @param {import('japanese-util').CodepointRange[]} ranges + * @returns {boolean} + */ +function isCodePointInRanges(codePoint, ranges) { + for (const [min, max] of ranges) { + if (codePoint >= min && codePoint <= max) { + return true; + } + } + return false; +} + +/** + * @param {string} previousCharacter + * @returns {?string} + */ +function getProlongedHiragana(previousCharacter) { + switch (KANA_TO_VOWEL_MAPPING.get(previousCharacter)) { + case 'a': return 'あ'; + case 
'i': return 'い'; + case 'u': return 'う'; + case 'e': return 'え'; + case 'o': return 'う'; + default: return null; + } +} + +/** + * @param {string} text + * @param {string} reading + * @returns {import('japanese-util').FuriganaSegment} + */ +function createFuriganaSegment(text, reading) { + return {text, reading}; +} + +/** + * @param {string} reading + * @param {string} readingNormalized + * @param {import('japanese-util').FuriganaGroup[]} groups + * @param {number} groupsStart + * @returns {?(import('japanese-util').FuriganaSegment[])} + */ +function segmentizeFurigana(reading, readingNormalized, groups, groupsStart) { + const groupCount = groups.length - groupsStart; + if (groupCount <= 0) { + return reading.length === 0 ? [] : null; + } + + const group = groups[groupsStart]; + const {isKana, text} = group; + const textLength = text.length; + if (isKana) { + const {textNormalized} = group; + if (textNormalized !== null && readingNormalized.startsWith(textNormalized)) { + const segments = segmentizeFurigana( + reading.substring(textLength), + readingNormalized.substring(textLength), + groups, + groupsStart + 1 + ); + if (segments !== null) { + if (reading.startsWith(text)) { + segments.unshift(createFuriganaSegment(text, '')); + } else { + segments.unshift(...getFuriganaKanaSegments(text, reading)); + } + return segments; + } + } + return null; + } else { + let result = null; + for (let i = reading.length; i >= textLength; --i) { + const segments = segmentizeFurigana( + reading.substring(i), + readingNormalized.substring(i), + groups, + groupsStart + 1 + ); + if (segments !== null) { + if (result !== null) { + // More than one way to segmentize the tail; mark as ambiguous + return null; + } + const segmentReading = reading.substring(0, i); + segments.unshift(createFuriganaSegment(text, segmentReading)); + result = segments; + } + // There is only one way to segmentize the last non-kana group + if (groupCount === 1) { + break; + } + } + return result; + } +} + +/** 
+ * @param {string} text + * @param {string} reading + * @returns {import('japanese-util').FuriganaSegment[]} + */ +function getFuriganaKanaSegments(text, reading) { + const textLength = text.length; + const newSegments = []; + let start = 0; + let state = (reading[0] === text[0]); + for (let i = 1; i < textLength; ++i) { + const newState = (reading[i] === text[i]); + if (state === newState) { continue; } + newSegments.push(createFuriganaSegment(text.substring(start, i), state ? '' : reading.substring(start, i))); + state = newState; + start = i; + } + newSegments.push(createFuriganaSegment(text.substring(start, textLength), state ? '' : reading.substring(start, textLength))); + return newSegments; +} + +/** + * @param {string} text1 + * @param {string} text2 + * @returns {number} + */ +function getStemLength(text1, text2) { + const minLength = Math.min(text1.length, text2.length); + if (minLength === 0) { return 0; } + + let i = 0; + while (true) { + const char1 = /** @type {number} */ (text1.codePointAt(i)); + const char2 = /** @type {number} */ (text2.codePointAt(i)); + if (char1 !== char2) { break; } + const charLength = String.fromCodePoint(char1).length; + i += charLength; + if (i >= minLength) { + if (i > minLength) { + i -= charLength; // Don't consume partial UTF16 surrogate characters + } + break; + } + } + return i; +} + + +// Character code testing functions + +/** + * @param {number} codePoint + * @returns {boolean} + */ +export function isCodePointKanji(codePoint) { + return isCodePointInRanges(codePoint, CJK_IDEOGRAPH_RANGES); +} + +/** + * @param {number} codePoint + * @returns {boolean} + */ +export function isCodePointKana(codePoint) { + return isCodePointInRanges(codePoint, KANA_RANGES); +} + +/** + * @param {number} codePoint + * @returns {boolean} + */ +export function isCodePointJapanese(codePoint) { + return isCodePointInRanges(codePoint, JAPANESE_RANGES); +} + + +// String testing functions + +/** + * @param {string} str + * @returns 
{boolean} + */ +export function isStringEntirelyKana(str) { + if (str.length === 0) { return false; } + for (const c of str) { + if (!isCodePointInRanges(/** @type {number} */ (c.codePointAt(0)), KANA_RANGES)) { + return false; + } + } + return true; +} + +/** + * @param {string} str + * @returns {boolean} + */ +export function isStringPartiallyJapanese(str) { + if (str.length === 0) { return false; } + for (const c of str) { + if (isCodePointInRanges(/** @type {number} */ (c.codePointAt(0)), JAPANESE_RANGES)) { + return true; + } + } + return false; +} + + +// Mora functions + +/** + * @param {number} moraIndex + * @param {number} pitchAccentDownstepPosition + * @returns {boolean} + */ +export function isMoraPitchHigh(moraIndex, pitchAccentDownstepPosition) { + switch (pitchAccentDownstepPosition) { + case 0: return (moraIndex > 0); + case 1: return (moraIndex < 1); + default: return (moraIndex > 0 && moraIndex < pitchAccentDownstepPosition); + } +} + +/** + * @param {string} text + * @param {number} pitchAccentDownstepPosition + * @param {boolean} isVerbOrAdjective + * @returns {?import('japanese-util').PitchCategory} + */ +export function getPitchCategory(text, pitchAccentDownstepPosition, isVerbOrAdjective) { + if (pitchAccentDownstepPosition === 0) { + return 'heiban'; + } + if (isVerbOrAdjective) { + return pitchAccentDownstepPosition > 0 ? 'kifuku' : null; + } + if (pitchAccentDownstepPosition === 1) { + return 'atamadaka'; + } + if (pitchAccentDownstepPosition > 1) { + return pitchAccentDownstepPosition >= getKanaMoraCount(text) ? 
'odaka' : 'nakadaka'; + } + return null; +} + +/** + * @param {string} text + * @returns {string[]} + */ +export function getKanaMorae(text) { + const morae = []; + let i; + for (const c of text) { + if (SMALL_KANA_SET.has(c) && (i = morae.length) > 0) { + morae[i - 1] += c; + } else { + morae.push(c); + } + } + return morae; +} + +/** + * @param {string} text + * @returns {number} + */ +export function getKanaMoraCount(text) { + let moraCount = 0; + for (const c of text) { + if (!(SMALL_KANA_SET.has(c) && moraCount > 0)) { + ++moraCount; + } + } + return moraCount; +} + + +// Conversion functions + +/** + * @param {string} text + * @param {boolean} [keepProlongedSoundMarks] + * @returns {string} + */ +export function convertKatakanaToHiragana(text, keepProlongedSoundMarks = false) { + let result = ''; + const offset = (HIRAGANA_CONVERSION_RANGE[0] - KATAKANA_CONVERSION_RANGE[0]); + for (let char of text) { + const codePoint = /** @type {number} */ (char.codePointAt(0)); + switch (codePoint) { + case KATAKANA_SMALL_KA_CODE_POINT: + case KATAKANA_SMALL_KE_CODE_POINT: + // No change + break; + case KANA_PROLONGED_SOUND_MARK_CODE_POINT: + if (!keepProlongedSoundMarks && result.length > 0) { + const char2 = getProlongedHiragana(result[result.length - 1]); + if (char2 !== null) { char = char2; } + } + break; + default: + if (isCodePointInRange(codePoint, KATAKANA_CONVERSION_RANGE)) { + char = String.fromCodePoint(codePoint + offset); + } + break; + } + result += char; + } + return result; +} + +/** + * @param {string} text + * @returns {string} + */ +export function convertHiraganaToKatakana(text) { + let result = ''; + const offset = (KATAKANA_CONVERSION_RANGE[0] - HIRAGANA_CONVERSION_RANGE[0]); + for (let char of text) { + const codePoint = /** @type {number} */ (char.codePointAt(0)); + if (isCodePointInRange(codePoint, HIRAGANA_CONVERSION_RANGE)) { + char = String.fromCodePoint(codePoint + offset); + } + result += char; + } + return result; +} + +/** + * @param 
{string} text + * @returns {string} + */ +export function convertNumericToFullWidth(text) { + let result = ''; + for (const char of text) { + let c = /** @type {number} */ (char.codePointAt(0)); + if (c >= 0x30 && c <= 0x39) { // ['0', '9'] + c += 0xff10 - 0x30; // 0xff10 = '0' full width + result += String.fromCodePoint(c); + } else { + result += char; + } + } + return result; +} + +/** + * @param {string} text + * @param {?import('../general/text-source-map.js').TextSourceMap} [sourceMap] + * @returns {string} + */ +export function convertHalfWidthKanaToFullWidth(text, sourceMap = null) { + let result = ''; + + // This function is safe to use charCodeAt instead of codePointAt, since all + // the relevant characters are represented with a single UTF-16 character code. + for (let i = 0, ii = text.length; i < ii; ++i) { + const c = text[i]; + const mapping = HALFWIDTH_KATAKANA_MAPPING.get(c); + if (typeof mapping !== 'string') { + result += c; + continue; + } + + let index = 0; + switch (text.charCodeAt(i + 1)) { + case 0xff9e: // dakuten + index = 1; + break; + case 0xff9f: // handakuten + index = 2; + break; + } + + let c2 = mapping[index]; + if (index > 0) { + if (c2 === '-') { // invalid + index = 0; + c2 = mapping[0]; + } else { + ++i; + } + } + + if (sourceMap !== null && index > 0) { + sourceMap.combine(result.length, 1); + } + result += c2; + } + + return result; +} + +/** + * @param {string} character + * @returns {?{character: string, type: import('japanese-util').DiacriticType}} + */ +export function getKanaDiacriticInfo(character) { + const info = DIACRITIC_MAPPING.get(character); + return typeof info !== 'undefined' ? 
{character: info.character, type: info.type} : null; +} + + +// Furigana distribution + +/** + * @param {string} term + * @param {string} reading + * @returns {import('japanese-util').FuriganaSegment[]} + */ +export function distributeFurigana(term, reading) { + if (reading === term) { + // Same + return [createFuriganaSegment(term, '')]; + } + + /** @type {import('japanese-util').FuriganaGroup[]} */ + const groups = []; + /** @type {?import('japanese-util').FuriganaGroup} */ + let groupPre = null; + let isKanaPre = null; + for (const c of term) { + const codePoint = /** @type {number} */ (c.codePointAt(0)); + const isKana = isCodePointKana(codePoint); + if (isKana === isKanaPre) { + /** @type {import('japanese-util').FuriganaGroup} */ (groupPre).text += c; + } else { + groupPre = {isKana, text: c, textNormalized: null}; + groups.push(groupPre); + isKanaPre = isKana; + } + } + for (const group of groups) { + if (group.isKana) { + group.textNormalized = convertKatakanaToHiragana(group.text); + } + } + + const readingNormalized = convertKatakanaToHiragana(reading); + const segments = segmentizeFurigana(reading, readingNormalized, groups, 0); + if (segments !== null) { + return segments; + } + + // Fallback + return [createFuriganaSegment(term, reading)]; +} + +/** + * @param {string} term + * @param {string} reading + * @param {string} source + * @returns {import('japanese-util').FuriganaSegment[]} + */ +export function distributeFuriganaInflected(term, reading, source) { + const termNormalized = convertKatakanaToHiragana(term); + const readingNormalized = convertKatakanaToHiragana(reading); + const sourceNormalized = convertKatakanaToHiragana(source); + + let mainText = term; + let stemLength = getStemLength(termNormalized, sourceNormalized); + + // Check if source is derived from the reading instead of the term + const readingStemLength = getStemLength(readingNormalized, sourceNormalized); + if (readingStemLength > 0 && readingStemLength >= stemLength) { + mainText 
= reading; + stemLength = readingStemLength; + reading = `${source.substring(0, stemLength)}${reading.substring(stemLength)}`; + } + + const segments = []; + if (stemLength > 0) { + mainText = `${source.substring(0, stemLength)}${mainText.substring(stemLength)}`; + const segments2 = distributeFurigana(mainText, reading); + let consumed = 0; + for (const segment of segments2) { + const {text} = segment; + const start = consumed; + consumed += text.length; + if (consumed < stemLength) { + segments.push(segment); + } else if (consumed === stemLength) { + segments.push(segment); + break; + } else { + if (start < stemLength) { + segments.push(createFuriganaSegment(mainText.substring(start, stemLength), '')); + } + break; + } + } + } + + if (stemLength < source.length) { + const remainder = source.substring(stemLength); + const segmentCount = segments.length; + if (segmentCount > 0 && segments[segmentCount - 1].reading.length === 0) { + // Append to the last segment if it has an empty reading + segments[segmentCount - 1].text += remainder; + } else { + // Otherwise, create a new segment + segments.push(createFuriganaSegment(remainder, '')); + } + } + + return segments; +} + + +// Miscellaneous + +/** + * @param {string} text + * @param {boolean} fullCollapse + * @param {?import('../general/text-source-map.js').TextSourceMap} [sourceMap] + * @returns {string} + */ +export function collapseEmphaticSequences(text, fullCollapse, sourceMap = null) { + let result = ''; + let collapseCodePoint = -1; + const hasSourceMap = (sourceMap !== null); + for (const char of text) { + const c = char.codePointAt(0); + if ( + c === HIRAGANA_SMALL_TSU_CODE_POINT || + c === KATAKANA_SMALL_TSU_CODE_POINT || + c === KANA_PROLONGED_SOUND_MARK_CODE_POINT + ) { + if (collapseCodePoint !== c) { + collapseCodePoint = c; + if (!fullCollapse) { + result += char; + continue; + } + } + } else { + collapseCodePoint = -1; + result += char; + continue; + } + + if (hasSourceMap) { + 
sourceMap.combine(Math.max(0, result.length - 1), 1); + } + } + return result; +} diff --git a/ext/js/language/sandbox/japanese-util.js b/ext/js/language/sandbox/japanese-util.js deleted file mode 100644 index f9874cd480..0000000000 --- a/ext/js/language/sandbox/japanese-util.js +++ /dev/null @@ -1,885 +0,0 @@ -/* - * Copyright (C) 2023-2024 Yomitan Authors - * Copyright (C) 2020-2022 Yomichan Authors - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ - -const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063; -const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3; -const KATAKANA_SMALL_KA_CODE_POINT = 0x30f5; -const KATAKANA_SMALL_KE_CODE_POINT = 0x30f6; -const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 0x30fc; - -/** @type {import('japanese-util').CodepointRange} */ -const HIRAGANA_RANGE = [0x3040, 0x309f]; -/** @type {import('japanese-util').CodepointRange} */ -const KATAKANA_RANGE = [0x30a0, 0x30ff]; - -/** @type {import('japanese-util').CodepointRange} */ -const HIRAGANA_CONVERSION_RANGE = [0x3041, 0x3096]; -/** @type {import('japanese-util').CodepointRange} */ -const KATAKANA_CONVERSION_RANGE = [0x30a1, 0x30f6]; - -/** @type {import('japanese-util').CodepointRange[]} */ -const KANA_RANGES = [HIRAGANA_RANGE, KATAKANA_RANGE]; - -/** @type {import('japanese-util').CodepointRange} */ -const CJK_UNIFIED_IDEOGRAPHS_RANGE = [0x4e00, 0x9fff]; -/** @type {import('japanese-util').CodepointRange} */ -const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_RANGE = [0x3400, 0x4dbf]; -/** @type {import('japanese-util').CodepointRange} */ -const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_RANGE = [0x20000, 0x2a6df]; -/** @type {import('japanese-util').CodepointRange} */ -const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C_RANGE = [0x2a700, 0x2b73f]; -/** @type {import('japanese-util').CodepointRange} */ -const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE = [0x2b740, 0x2b81f]; -/** @type {import('japanese-util').CodepointRange} */ -const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE = [0x2b820, 0x2ceaf]; -/** @type {import('japanese-util').CodepointRange} */ -const CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE = [0x2ceb0, 0x2ebef]; -/** @type {import('japanese-util').CodepointRange} */ -const CJK_COMPATIBILITY_IDEOGRAPHS_RANGE = [0xf900, 0xfaff]; -/** @type {import('japanese-util').CodepointRange} */ -const CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE = [0x2f800, 0x2fa1f]; -/** @type {import('japanese-util').CodepointRange[]} */ -const CJK_IDEOGRAPH_RANGES = [ - 
CJK_UNIFIED_IDEOGRAPHS_RANGE, - CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_RANGE, - CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_RANGE, - CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C_RANGE, - CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_RANGE, - CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E_RANGE, - CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F_RANGE, - CJK_COMPATIBILITY_IDEOGRAPHS_RANGE, - CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_RANGE -]; - -/** - * Japanese character ranges, roughly ordered in order of expected frequency. - * @type {import('japanese-util').CodepointRange[]} - */ -const JAPANESE_RANGES = [ - HIRAGANA_RANGE, - KATAKANA_RANGE, - - ...CJK_IDEOGRAPH_RANGES, - - [0xff66, 0xff9f], // Halfwidth katakana - - [0x30fb, 0x30fc], // Katakana punctuation - [0xff61, 0xff65], // Kana punctuation - [0x3000, 0x303f], // CJK punctuation - - [0xff10, 0xff19], // Fullwidth numbers - [0xff21, 0xff3a], // Fullwidth upper case Latin letters - [0xff41, 0xff5a], // Fullwidth lower case Latin letters - - [0xff01, 0xff0f], // Fullwidth punctuation 1 - [0xff1a, 0xff1f], // Fullwidth punctuation 2 - [0xff3b, 0xff3f], // Fullwidth punctuation 3 - [0xff5b, 0xff60], // Fullwidth punctuation 4 - [0xffe0, 0xffee] // Currency markers -]; - -const SMALL_KANA_SET = new Set(Array.from('ぁぃぅぇぉゃゅょゎァィゥェォャュョヮ')); - -const HALFWIDTH_KATAKANA_MAPPING = new Map([ - ['ヲ', 'ヲヺ-'], - ['ァ', 'ァ--'], - ['ィ', 'ィ--'], - ['ゥ', 'ゥ--'], - ['ェ', 'ェ--'], - ['ォ', 'ォ--'], - ['ャ', 'ャ--'], - ['ュ', 'ュ--'], - ['ョ', 'ョ--'], - ['ッ', 'ッ--'], - ['ー', 'ー--'], - ['ア', 'ア--'], - ['イ', 'イ--'], - ['ウ', 'ウヴ-'], - ['エ', 'エ--'], - ['オ', 'オ--'], - ['カ', 'カガ-'], - ['キ', 'キギ-'], - ['ク', 'クグ-'], - ['ケ', 'ケゲ-'], - ['コ', 'コゴ-'], - ['サ', 'サザ-'], - ['シ', 'シジ-'], - ['ス', 'スズ-'], - ['セ', 'セゼ-'], - ['ソ', 'ソゾ-'], - ['タ', 'タダ-'], - ['チ', 'チヂ-'], - ['ツ', 'ツヅ-'], - ['テ', 'テデ-'], - ['ト', 'トド-'], - ['ナ', 'ナ--'], - ['ニ', 'ニ--'], - ['ヌ', 'ヌ--'], - ['ネ', 'ネ--'], - ['ノ', 'ノ--'], - ['ハ', 'ハバパ'], - ['ヒ', 'ヒビピ'], - ['フ', 'フブプ'], - ['ヘ', 'ヘベペ'], - ['ホ', 'ホボポ'], - ['マ', 'マ--'], - ['ミ', 'ミ--'], - ['ム', 
'ム--'], - ['メ', 'メ--'], - ['モ', 'モ--'], - ['ヤ', 'ヤ--'], - ['ユ', 'ユ--'], - ['ヨ', 'ヨ--'], - ['ラ', 'ラ--'], - ['リ', 'リ--'], - ['ル', 'ル--'], - ['レ', 'レ--'], - ['ロ', 'ロ--'], - ['ワ', 'ワ--'], - ['ン', 'ン--'] -]); - -const VOWEL_TO_KANA_MAPPING = new Map([ - ['a', 'ぁあかがさざただなはばぱまゃやらゎわヵァアカガサザタダナハバパマャヤラヮワヵヷ'], - ['i', 'ぃいきぎしじちぢにひびぴみりゐィイキギシジチヂニヒビピミリヰヸ'], - ['u', 'ぅうくぐすずっつづぬふぶぷむゅゆるゥウクグスズッツヅヌフブプムュユルヴ'], - ['e', 'ぇえけげせぜてでねへべぺめれゑヶェエケゲセゼテデネヘベペメレヱヶヹ'], - ['o', 'ぉおこごそぞとどのほぼぽもょよろをォオコゴソゾトドノホボポモョヨロヲヺ'], - ['', 'のノ'] -]); - -/** @type {Map} */ -const KANA_TO_VOWEL_MAPPING = new Map(); -for (const [vowel, characters] of VOWEL_TO_KANA_MAPPING) { - for (const character of characters) { - KANA_TO_VOWEL_MAPPING.set(character, vowel); - } -} - -const kana = 'うゔ-かが-きぎ-くぐ-けげ-こご-さざ-しじ-すず-せぜ-そぞ-ただ-ちぢ-つづ-てで-とど-はばぱひびぴふぶぷへべぺほぼぽワヷ-ヰヸ-ウヴ-ヱヹ-ヲヺ-カガ-キギ-クグ-ケゲ-コゴ-サザ-シジ-スズ-セゼ-ソゾ-タダ-チヂ-ツヅ-テデ-トド-ハバパヒビピフブプヘベペホボポ'; -/** @type {Map} */ -const DIACRITIC_MAPPING = new Map(); -for (let i = 0, ii = kana.length; i < ii; i += 3) { - const character = kana[i]; - const dakuten = kana[i + 1]; - const handakuten = kana[i + 2]; - DIACRITIC_MAPPING.set(dakuten, {character, type: 'dakuten'}); - if (handakuten !== '-') { - DIACRITIC_MAPPING.set(handakuten, {character, type: 'handakuten'}); - } -} - - -/** - * @param {number} codePoint - * @param {import('japanese-util').CodepointRange} range - * @returns {boolean} - */ -function isCodePointInRange(codePoint, [min, max]) { - return (codePoint >= min && codePoint <= max); -} - -/** - * @param {number} codePoint - * @param {import('japanese-util').CodepointRange[]} ranges - * @returns {boolean} - */ -function isCodePointInRanges(codePoint, ranges) { - for (const [min, max] of ranges) { - if (codePoint >= min && codePoint <= max) { - return true; - } - } - return false; -} - -/** - * @param {string} previousCharacter - * @returns {?string} - */ -function getProlongedHiragana(previousCharacter) { - switch (KANA_TO_VOWEL_MAPPING.get(previousCharacter)) { - case 'a': return 'あ'; - case 
'i': return 'い'; - case 'u': return 'う'; - case 'e': return 'え'; - case 'o': return 'う'; - default: return null; - } -} - - -export class JapaneseUtil { - /** - * @param {?import('wanakana')|import('../../../lib/wanakana.js')} wanakana - */ - constructor(wanakana = null) { - /** @type {?import('wanakana')} */ - this._wanakana = /** @type {import('wanakana')} */ (wanakana); - } - - // Character code testing functions - - /** - * @param {number} codePoint - * @returns {boolean} - */ - isCodePointKanji(codePoint) { - return isCodePointInRanges(codePoint, CJK_IDEOGRAPH_RANGES); - } - - /** - * @param {number} codePoint - * @returns {boolean} - */ - isCodePointKana(codePoint) { - return isCodePointInRanges(codePoint, KANA_RANGES); - } - - /** - * @param {number} codePoint - * @returns {boolean} - */ - isCodePointJapanese(codePoint) { - return isCodePointInRanges(codePoint, JAPANESE_RANGES); - } - - // String testing functions - - /** - * @param {string} str - * @returns {boolean} - */ - isStringEntirelyKana(str) { - if (str.length === 0) { return false; } - for (const c of str) { - if (!isCodePointInRanges(/** @type {number} */ (c.codePointAt(0)), KANA_RANGES)) { - return false; - } - } - return true; - } - - /** - * @param {string} str - * @returns {boolean} - */ - isStringPartiallyJapanese(str) { - if (str.length === 0) { return false; } - for (const c of str) { - if (isCodePointInRanges(/** @type {number} */ (c.codePointAt(0)), JAPANESE_RANGES)) { - return true; - } - } - return false; - } - - // Mora functions - - /** - * @param {number} moraIndex - * @param {number} pitchAccentDownstepPosition - * @returns {boolean} - */ - isMoraPitchHigh(moraIndex, pitchAccentDownstepPosition) { - switch (pitchAccentDownstepPosition) { - case 0: return (moraIndex > 0); - case 1: return (moraIndex < 1); - default: return (moraIndex > 0 && moraIndex < pitchAccentDownstepPosition); - } - } - - /** - * @param {string} text - * @param {number} pitchAccentDownstepPosition - * @param 
{boolean} isVerbOrAdjective - * @returns {?import('japanese-util').PitchCategory} - */ - getPitchCategory(text, pitchAccentDownstepPosition, isVerbOrAdjective) { - if (pitchAccentDownstepPosition === 0) { - return 'heiban'; - } - if (isVerbOrAdjective) { - return pitchAccentDownstepPosition > 0 ? 'kifuku' : null; - } - if (pitchAccentDownstepPosition === 1) { - return 'atamadaka'; - } - if (pitchAccentDownstepPosition > 1) { - return pitchAccentDownstepPosition >= this.getKanaMoraCount(text) ? 'odaka' : 'nakadaka'; - } - return null; - } - - /** - * @param {string} text - * @returns {string[]} - */ - getKanaMorae(text) { - const morae = []; - let i; - for (const c of text) { - if (SMALL_KANA_SET.has(c) && (i = morae.length) > 0) { - morae[i - 1] += c; - } else { - morae.push(c); - } - } - return morae; - } - - /** - * @param {string} text - * @returns {number} - */ - getKanaMoraCount(text) { - let moraCount = 0; - for (const c of text) { - if (!(SMALL_KANA_SET.has(c) && moraCount > 0)) { - ++moraCount; - } - } - return moraCount; - } - - // Conversion functions - - /** - * @param {string} text - * @returns {string} - */ - convertToKana(text) { - return this._getWanakana().toKana(text); - } - - /** - * @returns {boolean} - */ - convertToKanaSupported() { - return this._wanakana !== null; - } - - /** - * @param {string} text - * @param {boolean} [keepProlongedSoundMarks] - * @returns {string} - */ - convertKatakanaToHiragana(text, keepProlongedSoundMarks = false) { - let result = ''; - const offset = (HIRAGANA_CONVERSION_RANGE[0] - KATAKANA_CONVERSION_RANGE[0]); - for (let char of text) { - const codePoint = /** @type {number} */ (char.codePointAt(0)); - switch (codePoint) { - case KATAKANA_SMALL_KA_CODE_POINT: - case KATAKANA_SMALL_KE_CODE_POINT: - // No change - break; - case KANA_PROLONGED_SOUND_MARK_CODE_POINT: - if (!keepProlongedSoundMarks && result.length > 0) { - const char2 = getProlongedHiragana(result[result.length - 1]); - if (char2 !== null) { char = 
char2; } - } - break; - default: - if (isCodePointInRange(codePoint, KATAKANA_CONVERSION_RANGE)) { - char = String.fromCodePoint(codePoint + offset); - } - break; - } - result += char; - } - return result; - } - - /** - * @param {string} text - * @returns {string} - */ - convertHiraganaToKatakana(text) { - let result = ''; - const offset = (KATAKANA_CONVERSION_RANGE[0] - HIRAGANA_CONVERSION_RANGE[0]); - for (let char of text) { - const codePoint = /** @type {number} */ (char.codePointAt(0)); - if (isCodePointInRange(codePoint, HIRAGANA_CONVERSION_RANGE)) { - char = String.fromCodePoint(codePoint + offset); - } - result += char; - } - return result; - } - - /** - * @param {string} text - * @returns {string} - */ - convertToRomaji(text) { - const wanakana = this._getWanakana(); - return wanakana.toRomaji(text); - } - - /** - * @returns {boolean} - */ - convertToRomajiSupported() { - return this._wanakana !== null; - } - - /** - * @param {string} text - * @returns {string} - */ - convertNumericToFullWidth(text) { - let result = ''; - for (const char of text) { - let c = /** @type {number} */ (char.codePointAt(0)); - if (c >= 0x30 && c <= 0x39) { // ['0', '9'] - c += 0xff10 - 0x30; // 0xff10 = '0' full width - result += String.fromCodePoint(c); - } else { - result += char; - } - } - return result; - } - - /** - * @param {string} text - * @param {?import('../../general/text-source-map.js').TextSourceMap} [sourceMap] - * @returns {string} - */ - convertHalfWidthKanaToFullWidth(text, sourceMap = null) { - let result = ''; - - // This function is safe to use charCodeAt instead of codePointAt, since all - // the relevant characters are represented with a single UTF-16 character code. 
- for (let i = 0, ii = text.length; i < ii; ++i) { - const c = text[i]; - const mapping = HALFWIDTH_KATAKANA_MAPPING.get(c); - if (typeof mapping !== 'string') { - result += c; - continue; - } - - let index = 0; - switch (text.charCodeAt(i + 1)) { - case 0xff9e: // dakuten - index = 1; - break; - case 0xff9f: // handakuten - index = 2; - break; - } - - let c2 = mapping[index]; - if (index > 0) { - if (c2 === '-') { // invalid - index = 0; - c2 = mapping[0]; - } else { - ++i; - } - } - - if (sourceMap !== null && index > 0) { - sourceMap.combine(result.length, 1); - } - result += c2; - } - - return result; - } - - /** - * @param {string} text - * @param {?import('../../general/text-source-map.js').TextSourceMap} sourceMap - * @returns {string} - */ - convertAlphabeticToKana(text, sourceMap = null) { - let part = ''; - let result = ''; - - for (const char of text) { - // Note: 0x61 is the character code for 'a' - let c = /** @type {number} */ (char.codePointAt(0)); - if (c >= 0x41 && c <= 0x5a) { // ['A', 'Z'] - c += (0x61 - 0x41); - } else if (c >= 0x61 && c <= 0x7a) { // ['a', 'z'] - // NOP; c += (0x61 - 0x61); - } else if (c >= 0xff21 && c <= 0xff3a) { // ['A', 'Z'] fullwidth - c += (0x61 - 0xff21); - } else if (c >= 0xff41 && c <= 0xff5a) { // ['a', 'z'] fullwidth - c += (0x61 - 0xff41); - } else if (c === 0x2d || c === 0xff0d) { // '-' or fullwidth dash - c = 0x2d; // '-' - } else { - if (part.length > 0) { - result += this._convertAlphabeticPartToKana(part, sourceMap, result.length); - part = ''; - } - result += char; - continue; - } - part += String.fromCodePoint(c); - } - - if (part.length > 0) { - result += this._convertAlphabeticPartToKana(part, sourceMap, result.length); - } - return result; - } - - /** - * @returns {boolean} - */ - convertAlphabeticToKanaSupported() { - return this._wanakana !== null; - } - - /** - * @param {string} character - * @returns {?{character: string, type: import('japanese-util').DiacriticType}} - */ - 
getKanaDiacriticInfo(character) { - const info = DIACRITIC_MAPPING.get(character); - return typeof info !== 'undefined' ? {character: info.character, type: info.type} : null; - } - - // Furigana distribution - - /** - * @param {string} term - * @param {string} reading - * @returns {import('japanese-util').FuriganaSegment[]} - */ - distributeFurigana(term, reading) { - if (reading === term) { - // Same - return [this._createFuriganaSegment(term, '')]; - } - - /** @type {import('japanese-util').FuriganaGroup[]} */ - const groups = []; - /** @type {?import('japanese-util').FuriganaGroup} */ - let groupPre = null; - let isKanaPre = null; - for (const c of term) { - const codePoint = /** @type {number} */ (c.codePointAt(0)); - const isKana = this.isCodePointKana(codePoint); - if (isKana === isKanaPre) { - /** @type {import('japanese-util').FuriganaGroup} */ (groupPre).text += c; - } else { - groupPre = {isKana, text: c, textNormalized: null}; - groups.push(groupPre); - isKanaPre = isKana; - } - } - for (const group of groups) { - if (group.isKana) { - group.textNormalized = this.convertKatakanaToHiragana(group.text); - } - } - - const readingNormalized = this.convertKatakanaToHiragana(reading); - const segments = this._segmentizeFurigana(reading, readingNormalized, groups, 0); - if (segments !== null) { - return segments; - } - - // Fallback - return [this._createFuriganaSegment(term, reading)]; - } - - /** - * @param {string} term - * @param {string} reading - * @param {string} source - * @returns {import('japanese-util').FuriganaSegment[]} - */ - distributeFuriganaInflected(term, reading, source) { - const termNormalized = this.convertKatakanaToHiragana(term); - const readingNormalized = this.convertKatakanaToHiragana(reading); - const sourceNormalized = this.convertKatakanaToHiragana(source); - - let mainText = term; - let stemLength = this._getStemLength(termNormalized, sourceNormalized); - - // Check if source is derived from the reading instead of the term - const 
readingStemLength = this._getStemLength(readingNormalized, sourceNormalized); - if (readingStemLength > 0 && readingStemLength >= stemLength) { - mainText = reading; - stemLength = readingStemLength; - reading = `${source.substring(0, stemLength)}${reading.substring(stemLength)}`; - } - - const segments = []; - if (stemLength > 0) { - mainText = `${source.substring(0, stemLength)}${mainText.substring(stemLength)}`; - const segments2 = this.distributeFurigana(mainText, reading); - let consumed = 0; - for (const segment of segments2) { - const {text} = segment; - const start = consumed; - consumed += text.length; - if (consumed < stemLength) { - segments.push(segment); - } else if (consumed === stemLength) { - segments.push(segment); - break; - } else { - if (start < stemLength) { - segments.push(this._createFuriganaSegment(mainText.substring(start, stemLength), '')); - } - break; - } - } - } - - if (stemLength < source.length) { - const remainder = source.substring(stemLength); - const segmentCount = segments.length; - if (segmentCount > 0 && segments[segmentCount - 1].reading.length === 0) { - // Append to the last segment if it has an empty reading - segments[segmentCount - 1].text += remainder; - } else { - // Otherwise, create a new segment - segments.push(this._createFuriganaSegment(remainder, '')); - } - } - - return segments; - } - - // Miscellaneous - - /** - * @param {string} text - * @param {boolean} fullCollapse - * @param {?import('../../general/text-source-map.js').TextSourceMap} [sourceMap] - * @returns {string} - */ - collapseEmphaticSequences(text, fullCollapse, sourceMap = null) { - let result = ''; - let collapseCodePoint = -1; - const hasSourceMap = (sourceMap !== null); - for (const char of text) { - const c = char.codePointAt(0); - if ( - c === HIRAGANA_SMALL_TSU_CODE_POINT || - c === KATAKANA_SMALL_TSU_CODE_POINT || - c === KANA_PROLONGED_SOUND_MARK_CODE_POINT - ) { - if (collapseCodePoint !== c) { - collapseCodePoint = c; - if (!fullCollapse) 
{ - result += char; - continue; - } - } - } else { - collapseCodePoint = -1; - result += char; - continue; - } - - if (hasSourceMap) { - sourceMap.combine(Math.max(0, result.length - 1), 1); - } - } - return result; - } - - // Private - - /** - * @param {string} text - * @param {string} reading - * @returns {import('japanese-util').FuriganaSegment} - */ - _createFuriganaSegment(text, reading) { - return {text, reading}; - } - - /** - * @param {string} reading - * @param {string} readingNormalized - * @param {import('japanese-util').FuriganaGroup[]} groups - * @param {number} groupsStart - * @returns {?(import('japanese-util').FuriganaSegment[])} - */ - _segmentizeFurigana(reading, readingNormalized, groups, groupsStart) { - const groupCount = groups.length - groupsStart; - if (groupCount <= 0) { - return reading.length === 0 ? [] : null; - } - - const group = groups[groupsStart]; - const {isKana, text} = group; - const textLength = text.length; - if (isKana) { - const {textNormalized} = group; - if (textNormalized !== null && readingNormalized.startsWith(textNormalized)) { - const segments = this._segmentizeFurigana( - reading.substring(textLength), - readingNormalized.substring(textLength), - groups, - groupsStart + 1 - ); - if (segments !== null) { - if (reading.startsWith(text)) { - segments.unshift(this._createFuriganaSegment(text, '')); - } else { - segments.unshift(...this._getFuriganaKanaSegments(text, reading)); - } - return segments; - } - } - return null; - } else { - let result = null; - for (let i = reading.length; i >= textLength; --i) { - const segments = this._segmentizeFurigana( - reading.substring(i), - readingNormalized.substring(i), - groups, - groupsStart + 1 - ); - if (segments !== null) { - if (result !== null) { - // More than one way to segmentize the tail; mark as ambiguous - return null; - } - const segmentReading = reading.substring(0, i); - segments.unshift(this._createFuriganaSegment(text, segmentReading)); - result = segments; - } - // 
There is only one way to segmentize the last non-kana group - if (groupCount === 1) { - break; - } - } - return result; - } - } - - /** - * @param {string} text - * @param {string} reading - * @returns {import('japanese-util').FuriganaSegment[]} - */ - _getFuriganaKanaSegments(text, reading) { - const textLength = text.length; - const newSegments = []; - let start = 0; - let state = (reading[0] === text[0]); - for (let i = 1; i < textLength; ++i) { - const newState = (reading[i] === text[i]); - if (state === newState) { continue; } - newSegments.push(this._createFuriganaSegment(text.substring(start, i), state ? '' : reading.substring(start, i))); - state = newState; - start = i; - } - newSegments.push(this._createFuriganaSegment(text.substring(start, textLength), state ? '' : reading.substring(start, textLength))); - return newSegments; - } - - /** - * @returns {import('wanakana')} - * @throws {Error} - */ - _getWanakana() { - const wanakana = this._wanakana; - if (wanakana === null) { throw new Error('Functions which use WanaKana are not supported in this context'); } - return wanakana; - } - - /** - * @param {string} text - * @param {?import('../../general/text-source-map.js').TextSourceMap} sourceMap - * @param {number} sourceMapStart - * @returns {string} - */ - _convertAlphabeticPartToKana(text, sourceMap, sourceMapStart) { - const wanakana = this._getWanakana(); - const result = wanakana.toHiragana(text); - - // Generate source mapping - if (sourceMap !== null) { - let i = 0; - let resultPos = 0; - const ii = text.length; - while (i < ii) { - // Find smallest matching substring - let iNext = i + 1; - let resultPosNext = result.length; - while (iNext < ii) { - const t = wanakana.toHiragana(text.substring(0, iNext)); - if (t === result.substring(0, t.length)) { - resultPosNext = t.length; - break; - } - ++iNext; - } - - // Merge characters - const removals = iNext - i - 1; - if (removals > 0) { - sourceMap.combine(sourceMapStart, removals); - } - 
++sourceMapStart; - - // Empty elements - const additions = resultPosNext - resultPos - 1; - for (let j = 0; j < additions; ++j) { - sourceMap.insert(sourceMapStart, 0); - ++sourceMapStart; - } - - i = iNext; - resultPos = resultPosNext; - } - } - - return result; - } - - /** - * @param {string} text1 - * @param {string} text2 - * @returns {number} - */ - _getStemLength(text1, text2) { - const minLength = Math.min(text1.length, text2.length); - if (minLength === 0) { return 0; } - - let i = 0; - while (true) { - const char1 = /** @type {number} */ (text1.codePointAt(i)); - const char2 = /** @type {number} */ (text2.codePointAt(i)); - if (char1 !== char2) { break; } - const charLength = String.fromCodePoint(char1).length; - i += charLength; - if (i >= minLength) { - if (i > minLength) { - i -= charLength; // Don't consume partial UTF16 surrogate characters - } - break; - } - } - return i; - } -} diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index cedc7d3d18..66eeb69fb5 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -19,6 +19,8 @@ import {RegexUtil} from '../general/regex-util.js'; import {TextSourceMap} from '../general/text-source-map.js'; import {Deinflector} from './deinflector.js'; +import {convertAlphabeticToKana} from './japanese-wanakana.js'; +import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth, isCodePointJapanese} from './japanese.js'; /** * Class which finds term and kanji dictionary entries for text. @@ -28,9 +30,7 @@ export class Translator { * Creates a new Translator instance. * @param {import('translator').ConstructorDetails} details The details for the class. 
*/ - constructor({japaneseUtil, database}) { - /** @type {import('./sandbox/japanese-util.js').JapaneseUtil} */ - this._japaneseUtil = japaneseUtil; + constructor({database}) { /** @type {import('../dictionary/dictionary-database.js').DictionaryDatabase} */ this._database = database; /** @type {?Deinflector} */ @@ -436,7 +436,6 @@ export class Translator { this._getCollapseEmphaticOptions(options) ]; - const jp = this._japaneseUtil; /** @type {import('translation-internal').DatabaseDeinflection[]} */ const deinflections = []; const used = new Set(); @@ -447,22 +446,22 @@ export class Translator { text2 = this._applyTextReplacements(text2, sourceMap, textReplacements); } if (halfWidth) { - text2 = jp.convertHalfWidthKanaToFullWidth(text2, sourceMap); + text2 = convertHalfWidthKanaToFullWidth(text2, sourceMap); } if (numeric) { - text2 = jp.convertNumericToFullWidth(text2); + text2 = convertNumericToFullWidth(text2); } if (alphabetic) { - text2 = jp.convertAlphabeticToKana(text2, sourceMap); + text2 = convertAlphabeticToKana(text2, sourceMap); } if (katakana) { - text2 = jp.convertHiraganaToKatakana(text2); + text2 = convertHiraganaToKatakana(text2); } if (hiragana) { - text2 = jp.convertKatakanaToHiragana(text2); + text2 = convertKatakanaToHiragana(text2); } if (collapseEmphatic) { - text2 = jp.collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap); + text2 = collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap); } for ( @@ -519,10 +518,9 @@ export class Translator { * @returns {string} */ _getJapaneseOnlyText(text) { - const jp = this._japaneseUtil; let length = 0; for (const c of text) { - if (!jp.isCodePointJapanese(/** @type {number} */ (c.codePointAt(0)))) { + if (!isCodePointJapanese(/** @type {number} */ (c.codePointAt(0)))) { return text.substring(0, length); } length += c.length; diff --git a/ext/js/media/audio-downloader.js b/ext/js/media/audio-downloader.js index 3a3b21d0b7..b4f63b96c5 100644 --- a/ext/js/media/audio-downloader.js 
+++ b/ext/js/media/audio-downloader.js @@ -23,14 +23,13 @@ import {JsonSchema} from '../data/json-schema.js'; import {ArrayBufferUtil} from '../data/sandbox/array-buffer-util.js'; import {NativeSimpleDOMParser} from '../dom/native-simple-dom-parser.js'; import {SimpleDOMParser} from '../dom/simple-dom-parser.js'; +import {isStringEntirelyKana} from '../language/japanese.js'; export class AudioDownloader { /** - * @param {{japaneseUtil: import('../language/sandbox/japanese-util.js').JapaneseUtil, requestBuilder: RequestBuilder}} details + * @param {{requestBuilder: RequestBuilder}} details */ - constructor({japaneseUtil, requestBuilder}) { - /** @type {import('../language/sandbox/japanese-util.js').JapaneseUtil} */ - this._japaneseUtil = japaneseUtil; + constructor({requestBuilder}) { /** @type {RequestBuilder} */ this._requestBuilder = requestBuilder; /** @type {?JsonSchema} */ @@ -111,7 +110,7 @@ export class AudioDownloader { /** @type {import('audio-downloader').GetInfoHandler} */ async _getInfoJpod101(term, reading) { - if (reading === term && this._japaneseUtil.isStringEntirelyKana(term)) { + if (reading === term && isStringEntirelyKana(term)) { reading = term; term = ''; } diff --git a/ext/js/pages/settings/anki-templates-controller.js b/ext/js/pages/settings/anki-templates-controller.js index 910e99acf0..869c9e16e0 100644 --- a/ext/js/pages/settings/anki-templates-controller.js +++ b/ext/js/pages/settings/anki-templates-controller.js @@ -20,7 +20,6 @@ import {ExtensionError} from '../../core/extension-error.js'; import {toError} from '../../core/to-error.js'; import {AnkiNoteBuilder} from '../../data/anki-note-builder.js'; import {querySelectorNotNull} from '../../dom/query-selector.js'; -import {JapaneseUtil} from '../../language/sandbox/japanese-util.js'; import {TemplateRendererProxy} from '../../templates/template-renderer-proxy.js'; import {yomitan} from '../../yomitan.js'; @@ -56,7 +55,7 @@ export class AnkiTemplatesController { /** @type 
{?import('./modal.js').Modal} */ this._fieldTemplateResetModal = null; /** @type {AnkiNoteBuilder} */ - this._ankiNoteBuilder = new AnkiNoteBuilder(new JapaneseUtil(null), new TemplateRendererProxy()); + this._ankiNoteBuilder = new AnkiNoteBuilder(new TemplateRendererProxy()); } /** */ diff --git a/ext/js/templates/sandbox/anki-template-renderer.js b/ext/js/templates/sandbox/anki-template-renderer.js index ef2c161062..52087336b3 100644 --- a/ext/js/templates/sandbox/anki-template-renderer.js +++ b/ext/js/templates/sandbox/anki-template-renderer.js @@ -22,7 +22,7 @@ import {DictionaryDataUtil} from '../../dictionary/dictionary-data-util.js'; import {PronunciationGenerator} from '../../display/sandbox/pronunciation-generator.js'; import {StructuredContentGenerator} from '../../display/sandbox/structured-content-generator.js'; import {CssStyleApplier} from '../../dom/sandbox/css-style-applier.js'; -import {JapaneseUtil} from '../../language/sandbox/japanese-util.js'; +import {convertHiraganaToKatakana, convertKatakanaToHiragana, distributeFurigana, getKanaMorae, getPitchCategory, isMoraPitchHigh} from '../../language/japanese.js'; import {AnkiTemplateRendererContentManager} from './anki-template-renderer-content-manager.js'; import {TemplateRendererMediaProvider} from './template-renderer-media-provider.js'; import {TemplateRenderer} from './template-renderer.js'; @@ -42,16 +42,14 @@ export class AnkiTemplateRenderer { this._pronunciationStyleApplier = new CssStyleApplier('/data/pronunciation-style.json'); /** @type {RegExp} */ this._structuredContentDatasetKeyIgnorePattern = /^sc([^a-z]|$)/; - /** @type {JapaneseUtil} */ - this._japaneseUtil = new JapaneseUtil(null); /** @type {TemplateRenderer} */ this._templateRenderer = new TemplateRenderer(); /** @type {AnkiNoteDataCreator} */ - this._ankiNoteDataCreator = new AnkiNoteDataCreator(this._japaneseUtil); + this._ankiNoteDataCreator = new AnkiNoteDataCreator(); /** @type {TemplateRendererMediaProvider} */ 
this._mediaProvider = new TemplateRendererMediaProvider(); /** @type {PronunciationGenerator} */ - this._pronunciationGenerator = new PronunciationGenerator(this._japaneseUtil); + this._pronunciationGenerator = new PronunciationGenerator(); /** @type {?(Map[])} */ this._stateStack = null; /** @type {?import('anki-note-builder').Requirement[]} */ @@ -171,7 +169,7 @@ export class AnkiTemplateRenderer { /** @type {import('template-renderer').HelperFunction} */ _furigana(args, context, options) { const {expression, reading} = this._getFuriganaExpressionAndReading(args, context, options); - const segments = this._japaneseUtil.distributeFurigana(expression, reading); + const segments = distributeFurigana(expression, reading); let result = ''; for (const {text, reading: reading2} of segments) { @@ -190,7 +188,7 @@ export class AnkiTemplateRenderer { /** @type {import('template-renderer').HelperFunction} */ _furiganaPlain(args, context, options) { const {expression, reading} = this._getFuriganaExpressionAndReading(args, context, options); - const segments = this._japaneseUtil.distributeFurigana(expression, reading); + const segments = distributeFurigana(expression, reading); let result = ''; for (const {text, reading: reading2} of segments) { @@ -512,13 +510,13 @@ export class AnkiTemplateRenderer { /** @type {import('template-renderer').HelperFunction} */ _isMoraPitchHigh(args) { const [index, position] = /** @type {[index: number, position: number]} */ (args); - return this._japaneseUtil.isMoraPitchHigh(index, position); + return isMoraPitchHigh(index, position); } /** @type {import('template-renderer').HelperFunction} */ _getKanaMorae(args) { const [text] = /** @type {[text: string]} */ (args); - return this._japaneseUtil.getKanaMorae(`${text}`); + return getKanaMorae(`${text}`); } /** @type {import('template-renderer').HelperFunction} */ @@ -555,7 +553,7 @@ export class AnkiTemplateRenderer { const isVerbOrAdjective = 
DictionaryDataUtil.isNonNounVerbOrAdjective(wordClasses); const pitches = DictionaryDataUtil.getPronunciationsOfType(pronunciations, 'pitch-accent'); for (const {position} of pitches) { - const category = this._japaneseUtil.getPitchCategory(reading, position, isVerbOrAdjective); + const category = getPitchCategory(reading, position, isVerbOrAdjective); if (category !== null) { categories.add(category); } @@ -666,7 +664,7 @@ export class AnkiTemplateRenderer { */ _createStructuredContentGenerator(data) { const contentManager = new AnkiTemplateRendererContentManager(this._mediaProvider, data); - const instance = new StructuredContentGenerator(contentManager, this._japaneseUtil, document); + const instance = new StructuredContentGenerator(contentManager, document); this._cleanupCallbacks.push(() => contentManager.unloadAll()); return instance; } @@ -735,7 +733,7 @@ export class AnkiTemplateRenderer { if (typeof downstepPosition !== 'number') { return ''; } if (!Array.isArray(nasalPositions)) { nasalPositions = []; } if (!Array.isArray(devoicePositions)) { devoicePositions = []; } - const morae = this._japaneseUtil.getKanaMorae(reading); + const morae = getKanaMorae(reading); switch (format) { case 'text': @@ -756,7 +754,7 @@ export class AnkiTemplateRenderer { const ii = args.length; const {keepProlongedSoundMarks} = options.hash; const value = (ii > 0 ? args[0] : this._computeValue(options, context)); - return typeof value === 'string' ? this._japaneseUtil.convertKatakanaToHiragana(value, keepProlongedSoundMarks === true) : ''; + return typeof value === 'string' ? convertKatakanaToHiragana(value, keepProlongedSoundMarks === true) : ''; } /** @@ -765,7 +763,7 @@ export class AnkiTemplateRenderer { _katakana(args, context, options) { const ii = args.length; const value = (ii > 0 ? args[0] : this._computeValue(options, context)); - return typeof value === 'string' ? this._japaneseUtil.convertHiraganaToKatakana(value) : ''; + return typeof value === 'string' ? 
convertHiraganaToKatakana(value) : ''; } /** diff --git a/test/fixtures/translator-test.js b/test/fixtures/translator-test.js index f162972dc1..6562931cd8 100644 --- a/test/fixtures/translator-test.js +++ b/test/fixtures/translator-test.js @@ -26,7 +26,6 @@ import {createDictionaryArchive} from '../../dev/util.js'; import {AnkiNoteDataCreator} from '../../ext/js/data/sandbox/anki-note-data-creator.js'; import {DictionaryDatabase} from '../../ext/js/dictionary/dictionary-database.js'; import {DictionaryImporter} from '../../ext/js/dictionary/dictionary-importer.js'; -import {JapaneseUtil} from '../../ext/js/language/sandbox/japanese-util.js'; import {Translator} from '../../ext/js/language/translator.js'; import {chrome, fetch} from '../mocks/common.js'; import {DictionaryImporterMediaLoader} from '../mocks/dictionary-importer-media-loader.js'; @@ -65,14 +64,13 @@ async function createTranslatorContext(dictionaryDirectory, dictionaryName) { expect(errors.length).toEqual(0); // Setup translator - const japaneseUtil = new JapaneseUtil(null); - const translator = new Translator({japaneseUtil, database: dictionaryDatabase}); + const translator = new Translator({database: dictionaryDatabase}); /** @type {import('deinflector').ReasonsRaw} */ const deinflectionReasons = parseJson(readFileSync(deinflectionReasonsPath, {encoding: 'utf8'})); translator.prepare(deinflectionReasons); // Assign properties - const ankiNoteDataCreator = new AnkiNoteDataCreator(japaneseUtil); + const ankiNoteDataCreator = new AnkiNoteDataCreator(); return {translator, ankiNoteDataCreator}; } diff --git a/test/japanese-util.test.js b/test/japanese-util.test.js index ab14f209f3..d7b05c3e32 100644 --- a/test/japanese-util.test.js +++ b/test/japanese-util.test.js @@ -18,10 +18,8 @@ import {describe, expect, test} from 'vitest'; import {TextSourceMap} from '../ext/js/general/text-source-map.js'; -import {JapaneseUtil} from '../ext/js/language/sandbox/japanese-util.js'; -import * as wanakana from 
'../ext/lib/wanakana.js'; - -const jp = new JapaneseUtil(wanakana); +import * as jpw from '../ext/js/language/japanese-wanakana.js'; +import * as jp from '../ext/js/language/japanese.js'; /** */ function testIsCodePointKanji() { @@ -199,7 +197,7 @@ function testConvertToRomaji() { ]; test.each(data)('%s -> %o', (string, expected) => { - expect(jp.convertToRomaji(string)).toStrictEqual(expected); + expect(jpw.convertToRomaji(string)).toStrictEqual(expected); }); }); } @@ -268,8 +266,8 @@ function testConvertAlphabeticToKana() { for (const [string, expected, expectedSourceMapping] of data) { test(`${string} -> ${string}${typeof expectedSourceMapping !== 'undefined' ? ', ' + JSON.stringify(expectedSourceMapping) : ''}`, () => { const sourceMap = new TextSourceMap(string); - const actual1 = jp.convertAlphabeticToKana(string, null); - const actual2 = jp.convertAlphabeticToKana(string, sourceMap); + const actual1 = jpw.convertAlphabeticToKana(string, null); + const actual2 = jpw.convertAlphabeticToKana(string, sourceMap); expect(actual1).toStrictEqual(expected); expect(actual2).toStrictEqual(expected); if (typeof expectedSourceMapping !== 'undefined') { diff --git a/test/utilities/anki.js b/test/utilities/anki.js index 322acb0dd8..e30d578ffa 100644 --- a/test/utilities/anki.js +++ b/test/utilities/anki.js @@ -16,7 +16,6 @@ */ import {AnkiNoteBuilder} from '../../ext/js/data/anki-note-builder.js'; -import {JapaneseUtil} from '../../ext/js/language/sandbox/japanese-util.js'; import {AnkiTemplateRenderer} from '../../ext/js/templates/sandbox/anki-template-renderer.js'; /** @@ -130,7 +129,6 @@ export async function getTemplateRenderResults(dictionaryEntries, type, mode, te const ankiTemplateRenderer = new AnkiTemplateRenderer(); await ankiTemplateRenderer.prepare(); - const japaneseUtil = new JapaneseUtil(null); const clozePrefix = 'cloze-prefix'; const clozeSuffix = 'cloze-suffix'; const results = []; @@ -146,7 +144,7 @@ export async function 
getTemplateRenderResults(dictionaryEntries, type, mode, te } break; } - const ankiNoteBuilder = new AnkiNoteBuilder(japaneseUtil, ankiTemplateRenderer.templateRenderer); + const ankiNoteBuilder = new AnkiNoteBuilder(ankiTemplateRenderer.templateRenderer); const context = { url: 'url:', sentence: { diff --git a/types/ext/display.d.ts b/types/ext/display.d.ts index 8666265974..b11d54e17a 100644 --- a/types/ext/display.d.ts +++ b/types/ext/display.d.ts @@ -17,7 +17,6 @@ import type {DisplayContentManager} from '../../ext/js/display/display-content-manager'; import type {HotkeyHelpController} from '../../ext/js/input/hotkey-help-controller'; -import type {JapaneseUtil} from '../../ext/js/language/sandbox/japanese-util'; import type * as Dictionary from './dictionary'; import type * as Extension from './extension'; import type * as Settings from './settings'; @@ -128,7 +127,6 @@ export type GetSearchContextCallback = TextScannerTypes.GetSearchContextCallback export type QueryParserConstructorDetails = { getSearchContext: GetSearchContextCallback; - japaneseUtil: JapaneseUtil; }; export type QueryParserOptions = { @@ -169,7 +167,6 @@ export type Events = { export type EventArgument> = BaseEventArgument; export type DisplayGeneratorConstructorDetails = { - japaneseUtil: JapaneseUtil; contentManager: DisplayContentManager; hotkeyHelpController?: HotkeyHelpController | null; }; diff --git a/types/ext/translator.d.ts b/types/ext/translator.d.ts index 65a77e908e..5d552ca8dc 100644 --- a/types/ext/translator.d.ts +++ b/types/ext/translator.d.ts @@ -16,13 +16,10 @@ */ import type {DictionaryDatabase} from '../../ext/js/dictionary/dictionary-database'; -import type {JapaneseUtil} from '../../ext/js/language/sandbox/japanese-util'; import type * as Dictionary from './dictionary'; import type * as DictionaryDatabaseTypes from './dictionary-database'; export type ConstructorDetails = { - /** An instance of JapaneseUtil. 
*/ - japaneseUtil: JapaneseUtil; /** An instance of DictionaryDatabase. */ database: DictionaryDatabase; };