diff --git a/.eslintrc.json b/.eslintrc.json
index 30d495d276..b50809b4ae 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -605,9 +605,13 @@
"ext/js/general/object-property-accessor.js",
"ext/js/general/regex-util.js",
"ext/js/general/text-source-map.js",
+ "ext/js/language/en/language-english.js",
"ext/js/language/ja/japanese-wanakana.js",
"ext/js/language/ja/japanese.js",
+ "ext/js/language/ja/language-japanese.js",
"ext/js/language/language-transformer.js",
+ "ext/js/language/languages.js",
+ "ext/js/language/text-preprocessors.js",
"ext/js/language/translator.js",
"ext/js/media/audio-downloader.js",
"ext/js/media/media-util.js",
diff --git a/benches/translator.bench.js b/benches/translator.bench.js
index 1231c31c11..8b9b5118ff 100644
--- a/benches/translator.bench.js
+++ b/benches/translator.bench.js
@@ -20,8 +20,8 @@ import {fileURLToPath} from 'node:url';
import path from 'path';
import {bench, describe} from 'vitest';
import {parseJson} from '../dev/json.js';
-import {createFindKanjiOptions, createFindTermsOptions} from '../test/utilities/translator.js';
import {createTranslatorContext} from '../test/fixtures/translator-test.js';
+import {createFindKanjiOptions, createFindTermsOptions} from '../test/utilities/translator.js';
const dirname = path.dirname(fileURLToPath(import.meta.url));
const dictionaryName = 'Test Dictionary 2';
@@ -33,10 +33,9 @@ describe('Translator', () => {
const {optionsPresets, tests} = parseJson(readFileSync(testInputsFilePath, {encoding: 'utf8'}));
const findKanjiTests = tests.filter((data) => data.options === 'kanji');
- const findTermTests = tests.filter((data) => data.options === 'default');
- const findTermWithTextTransformationsTests = tests.filter((data) => data.options !== 'kanji' && data.options !== 'default');
+ const findTermTests = tests.filter((data) => data.options !== 'kanji');
- bench(`Translator.prototype.findTerms - no text transformations (n=${findTermTests.length})`, async () => {
+ bench(`Translator.prototype.findTerms - (n=${findTermTests.length})`, async () => {
for (const data of /** @type {import('test/translator').TestInputFindTerm[]} */ (findTermTests)) {
const {mode, text} = data;
const options = createFindTermsOptions(dictionaryName, optionsPresets, data.options);
@@ -44,14 +43,6 @@ describe('Translator', () => {
}
});
- bench(`Translator.prototype.findTerms - text transformations (n=${findTermWithTextTransformationsTests.length})`, async () => {
- for (const data of /** @type {import('test/translator').TestInputFindTerm[]} */ (findTermWithTextTransformationsTests)) {
- const {mode, text} = data;
- const options = createFindTermsOptions(dictionaryName, optionsPresets, data.options);
- await translator.findTerms(mode, text, options);
- }
- });
-
bench(`Translator.prototype.findKanji - (n=${findKanjiTests.length})`, async () => {
for (const data of /** @type {import('test/translator').TestInputFindKanji[]} */ (findKanjiTests)) {
const {text} = data;
diff --git a/dev/jsconfig.json b/dev/jsconfig.json
index d94651083b..6a5fb13b2f 100644
--- a/dev/jsconfig.json
+++ b/dev/jsconfig.json
@@ -28,6 +28,9 @@
"error": ["../types/ext/error"],
"event-listener-collection": ["../types/ext/event-listener-collection"],
"japanese-util": ["../types/ext/japanese-util"],
+ "language": ["../types/ext/language"],
+ "language-english": ["../types/ext/language-english"],
+ "language-japanese": ["../types/ext/language-japanese"],
"ext/json-schema": ["../types/ext/json-schema"],
"language-transformer": ["../types/ext/language-transformer"],
"language-transformer-internal": ["../types/ext/language-transformer-internal"],
diff --git a/docs/anki-integration.md b/docs/anki-integration.md
index 2bd9fad99c..9bd5bd9443 100644
--- a/docs/anki-integration.md
+++ b/docs/anki-integration.md
@@ -23,71 +23,71 @@ Flashcard fields can be configured with the following steps:
#### Markers for Term Cards
- | Marker | Description |
- | -------------------------- | ------------------------------------------------------------------------------------------------------------------------ |
- | `{audio}` | Audio sample of a native speaker's pronunciation in MP3 format (if available). |
- | `{clipboard-image}` | An image which is stored in the system clipboard, if present. |
- | `{clipboard-text}` | Text which is stored in the system clipboard, if present. |
- | `{cloze-body}` | Raw, inflected term as it appeared before being reduced to dictionary form by Yomitan. |
- | `{cloze-body-kana}` | Kana reading for `{cloze-body}`. |
- | `{cloze-prefix}` | Fragment of the containing `{sentence}` starting at the beginning of `{sentence}` until the beginning of `{cloze-body}`. |
- | `{cloze-suffix}` | Fragment of the containing `{sentence}` starting at the end of `{cloze-body}` until the end of `{sentence}`. |
- | `{conjugation}` | Conjugation path from the raw inflected term to the source term. |
- | `{dictionary}` | Name of the dictionary from which the card is being created (unavailable in _grouped_ mode). |
- | `{document-title}` | Title of the web page that the term appeared in. |
- | `{expression}` | Term expressed as kanji (will be displayed in kana if kanji is not available). |
- | `{frequencies}` | Frequency information for the term. |
- | `{frequency-harmonic-rank}` | The harmonic mean of frequency data for the current term. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based term usage. |
+ | Marker | Description |
+ | --------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+ | `{audio}` | Audio sample of a native speaker's pronunciation in MP3 format (if available). |
+ | `{clipboard-image}` | An image which is stored in the system clipboard, if present. |
+ | `{clipboard-text}` | Text which is stored in the system clipboard, if present. |
+ | `{cloze-body}` | Raw, inflected term as it appeared before being reduced to dictionary form by Yomitan. |
+ | `{cloze-body-kana}` | Kana reading for `{cloze-body}`. |
+ | `{cloze-prefix}` | Fragment of the containing `{sentence}` starting at the beginning of `{sentence}` until the beginning of `{cloze-body}`. |
+ | `{cloze-suffix}` | Fragment of the containing `{sentence}` starting at the end of `{cloze-body}` until the end of `{sentence}`. |
+ | `{conjugation}` | Conjugation path from the raw inflected term to the source term. |
+ | `{dictionary}` | Name of the dictionary from which the card is being created (unavailable in _grouped_ mode). |
+ | `{document-title}` | Title of the web page that the term appeared in. |
+ | `{expression}` | Term expressed as kanji (will be displayed in kana if kanji is not available). |
+ | `{frequencies}` | Frequency information for the term. |
+ | `{frequency-harmonic-rank}` | The harmonic mean of frequency data for the current term. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based term usage. |
| `{frequency-harmonic-occurrence}` | The harmonic mean of frequency data for the current term. Defaults to 0 occurrences when frequency data is not found, the lowest possible occurrence-based term usage. |
- | `{frequency-average-rank}` | The average of frequency data for the current term. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based term usage. |
- | `{frequency-average-occurrence}` | The average of frequency data for the current term. Defaults to 0 occurrences when frequency data is not found, the lowest possible occurrence-based term usage. |
- | `{furigana}` | Term expressed as kanji with furigana displayed above it (e.g. 日本語). |
- | `{furigana-plain}` | Term expressed as kanji with furigana displayed next to it in brackets (e.g. 日本語[にほんご]). |
- | `{glossary}` | List of definitions for the term (output format depends on whether running in _grouped_ mode). |
- | `{glossary-brief}` | List of definitions for the term in a more compact format. |
- | `{glossary-no-dictionary}` | List of definitions for the term, except the dictionary tag is omitted. |
- | `{part-of-speech}` | Part of speech information for the term. |
- | `{phonetic-transcriptions}`| List of phonetic transcriptions for the term. |
- | `{pitch-accents}` | List of pitch accent downstep notations for the term. |
- | `{pitch-accent-graphs}` | List of pitch accent graphs for the term. |
- | `{pitch-accent-positions}` | List of accent downstep positions for the term as a number. |
- | `{pitch-accent-categories}`| List of pitch accent categories for the term (e.g. heiban, kifuku, atamadaka, odaka, nakadaka). |
- | `{reading}` | Kana reading for the term (empty for terms where the expression is the reading). |
- | `{screenshot}` | Screenshot of the web page taken at the time the term was added. |
- | `{search-query}` | The full search query shown on the search page. |
- | `{selection-text}` | The selected text on the search page or popup. |
- | `{sentence}` | Sentence, quote, or phrase that the term appears in from the source content. |
- | `{sentence-furigana}` | Sentence, quote, or phrase that the term appears in from the source content, with furigana added. |
- | `{tags}` | Grammar and usage tags providing information about the term (unavailable in _grouped_ mode). |
- | `{url}` | Address of the web page in which the term appeared in. |
+ | `{frequency-average-rank}` | The average of frequency data for the current term. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based term usage. |
+ | `{frequency-average-occurrence}` | The average of frequency data for the current term. Defaults to 0 occurrences when frequency data is not found, the lowest possible occurrence-based term usage. |
+ | `{furigana}` | Term expressed as kanji with furigana displayed above it (e.g. <ruby>日本語<rt>にほんご</rt></ruby>). |
+ | `{furigana-plain}` | Term expressed as kanji with furigana displayed next to it in brackets (e.g. 日本語[にほんご]). |
+ | `{glossary}` | List of definitions for the term (output format depends on whether running in _grouped_ mode). |
+ | `{glossary-brief}` | List of definitions for the term in a more compact format. |
+ | `{glossary-no-dictionary}` | List of definitions for the term, except the dictionary tag is omitted. |
+ | `{part-of-speech}` | Part of speech information for the term. |
+ | `{phonetic-transcriptions}` | List of phonetic transcriptions for the term. |
+ | `{pitch-accents}` | List of pitch accent downstep notations for the term. |
+ | `{pitch-accent-graphs}` | List of pitch accent graphs for the term. |
+ | `{pitch-accent-positions}` | List of accent downstep positions for the term as a number. |
+ | `{pitch-accent-categories}` | List of pitch accent categories for the term (e.g. heiban, kifuku, atamadaka, odaka, nakadaka). |
+ | `{reading}` | Kana reading for the term (empty for terms where the expression is the reading). |
+ | `{screenshot}` | Screenshot of the web page taken at the time the term was added. |
+ | `{search-query}` | The full search query shown on the search page. |
+ | `{selection-text}` | The selected text on the search page or popup. |
+ | `{sentence}` | Sentence, quote, or phrase that the term appears in from the source content. |
+ | `{sentence-furigana}` | Sentence, quote, or phrase that the term appears in from the source content, with furigana added. |
+ | `{tags}` | Grammar and usage tags providing information about the term (unavailable in _grouped_ mode). |
+ | `{url}` | Address of the web page in which the term appeared. |
#### Markers for Kanji Cards
- | Marker | Description |
- | --------------------- | ------------------------------------------------------------------------------------------------------------------------ |
- | `{character}` | Unicode glyph representing the current kanji. |
- | `{clipboard-image}` | An image which is stored in the system clipboard, if present. |
- | `{clipboard-text}` | Text which is stored in the system clipboard, if present. |
- | `{cloze-body}` | Raw, inflected parent term as it appeared before being reduced to dictionary form by Yomitan. |
- | `{cloze-prefix}` | Fragment of the containing `{sentence}` starting at the beginning of `{sentence}` until the beginning of `{cloze-body}`. |
- | `{cloze-suffix}` | Fragment of the containing `{sentence}` starting at the end of `{cloze-body}` until the end of `{sentence}`. |
- | `{dictionary}` | Name of the dictionary from which the card is being created. |
- | `{document-title}` | Title of the web page that the kanji appeared in. |
- | `{frequencies}` | Frequency information for the kanji. |
- | `{frequency-harmonic-rank}` | The harmonic mean of frequency data for the current kanji. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based kanji usage. |
+ | Marker | Description |
+ | --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+ | `{character}` | Unicode glyph representing the current kanji. |
+ | `{clipboard-image}` | An image which is stored in the system clipboard, if present. |
+ | `{clipboard-text}` | Text which is stored in the system clipboard, if present. |
+ | `{cloze-body}` | Raw, inflected parent term as it appeared before being reduced to dictionary form by Yomitan. |
+ | `{cloze-prefix}` | Fragment of the containing `{sentence}` starting at the beginning of `{sentence}` until the beginning of `{cloze-body}`. |
+ | `{cloze-suffix}` | Fragment of the containing `{sentence}` starting at the end of `{cloze-body}` until the end of `{sentence}`. |
+ | `{dictionary}` | Name of the dictionary from which the card is being created. |
+ | `{document-title}` | Title of the web page that the kanji appeared in. |
+ | `{frequencies}` | Frequency information for the kanji. |
+ | `{frequency-harmonic-rank}` | The harmonic mean of frequency data for the current kanji. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based kanji usage. |
| `{frequency-harmonic-occurrence}` | The harmonic mean of frequency data for the current kanji. Defaults to 0 occurrences when frequency data is not found, the lowest possible occurrence-based kanji usage. |
- | `{frequency-average-rank}` | The average of frequency data for the current kanji. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based kanji usage. |
- | `{frequency-average-occurrence}` | The average of frequency data for the current kanji. Defaults to 0 occurrences when frequency data is not found, the lowest possible occurrence-based kanji usage. |
- | `{glossary}` | List of definitions for the kanji. |
- | `{kunyomi}` | Kunyomi (Japanese reading) for the kanji expressed as katakana. |
- | `{onyomi}` | Onyomi (Chinese reading) for the kanji expressed as hiragana. |
- | `{screenshot}` | Screenshot of the web page taken at the time the kanji was added. |
- | `{search-query}` | The full search query shown on the search page. |
- | `{selection-text}` | The selected text on the search page or popup. |
- | `{sentence}` | Sentence, quote, or phrase that the character appears in from the source content. |
- | `{sentence-furigana}` | Sentence, quote, or phrase that the character appears in from the source content, with furigana added. |
- | `{stroke-count}` | Number of strokes that the kanji character has. |
- | `{url}` | Address of the web page in which the kanji appeared in. |
+ | `{frequency-average-rank}` | The average of frequency data for the current kanji. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based kanji usage. |
+ | `{frequency-average-occurrence}` | The average of frequency data for the current kanji. Defaults to 0 occurrences when frequency data is not found, the lowest possible occurrence-based kanji usage. |
+ | `{glossary}` | List of definitions for the kanji. |
+ | `{kunyomi}` | Kunyomi (Japanese reading) for the kanji expressed as katakana. |
+ | `{onyomi}` | Onyomi (Chinese reading) for the kanji expressed as hiragana. |
+ | `{screenshot}` | Screenshot of the web page taken at the time the kanji was added. |
+ | `{search-query}` | The full search query shown on the search page. |
+ | `{selection-text}` | The selected text on the search page or popup. |
+ | `{sentence}` | Sentence, quote, or phrase that the character appears in from the source content. |
+ | `{sentence-furigana}` | Sentence, quote, or phrase that the character appears in from the source content, with furigana added. |
+ | `{stroke-count}` | Number of strokes that the kanji character has. |
+ | `{url}` | Address of the web page in which the kanji appeared. |
When creating your model for Yomitan, _make sure that you pick a unique field to be first_; fields that will
contain `{expression}` or `{character}` are ideal candidates for this. Anki does not allow duplicate flashcards to be
diff --git a/ext/data/schemas/options-schema.json b/ext/data/schemas/options-schema.json
index ea7caf0fcd..d86eedf733 100644
--- a/ext/data/schemas/options-schema.json
+++ b/ext/data/schemas/options-schema.json
@@ -81,6 +81,7 @@
"type": "object",
"required": [
"enable",
+ "language",
"resultOutputMode",
"debugInfo",
"maxResults",
@@ -126,6 +127,10 @@
"type": "boolean",
"default": true
},
+ "language": {
+ "type": "string",
+ "default": "ja"
+ },
"resultOutputMode": {
"type": "string",
"enum": ["group", "merge", "split"],
@@ -722,12 +727,6 @@
"translation": {
"type": "object",
"required": [
- "convertHalfWidthCharacters",
- "convertNumericCharacters",
- "convertAlphabeticCharacters",
- "convertHiraganaToKatakana",
- "convertKatakanaToHiragana",
- "collapseEmphaticSequences",
"textReplacements",
"searchResolution"
],
@@ -740,36 +739,6 @@
],
"default": "letter"
},
- "convertHalfWidthCharacters": {
- "type": "string",
- "enum": ["false", "true", "variant"],
- "default": "false"
- },
- "convertNumericCharacters": {
- "type": "string",
- "enum": ["false", "true", "variant"],
- "default": "false"
- },
- "convertAlphabeticCharacters": {
- "type": "string",
- "enum": ["false", "true", "variant"],
- "default": "false"
- },
- "convertHiraganaToKatakana": {
- "type": "string",
- "enum": ["false", "true", "variant"],
- "default": "false"
- },
- "convertKatakanaToHiragana": {
- "type": "string",
- "enum": ["false", "true", "variant"],
- "default": "variant"
- },
- "collapseEmphaticSequences": {
- "type": "string",
- "enum": ["false", "true", "full"],
- "default": "false"
- },
"textReplacements": {
"type": "object",
"required": [
diff --git a/ext/js/background/backend.js b/ext/js/background/backend.js
index 8ab5623207..b96d81c774 100644
--- a/ext/js/background/backend.js
+++ b/ext/js/background/backend.js
@@ -34,6 +34,7 @@ import {DictionaryDatabase} from '../dictionary/dictionary-database.js';
import {Environment} from '../extension/environment.js';
import {ObjectPropertyAccessor} from '../general/object-property-accessor.js';
import {distributeFuriganaInflected, isCodePointJapanese, isStringPartiallyJapanese, convertKatakanaToHiragana as jpConvertKatakanaToHiragana} from '../language/ja/japanese.js';
+import {getLanguageSummaries} from '../language/languages.js';
import {Translator} from '../language/translator.js';
import {AudioDownloader} from '../media/audio-downloader.js';
import {getFileExtensionFromAudioMediaType, getFileExtensionFromImageMediaType} from '../media/media-util.js';
@@ -183,7 +184,8 @@ export class Backend {
['textHasJapaneseCharacters', this._onApiTextHasJapaneseCharacters.bind(this)],
['getTermFrequencies', this._onApiGetTermFrequencies.bind(this)],
['findAnkiNotes', this._onApiFindAnkiNotes.bind(this)],
- ['openCrossFramePort', this._onApiOpenCrossFramePort.bind(this)]
+ ['openCrossFramePort', this._onApiOpenCrossFramePort.bind(this)],
+ ['getLanguageSummaries', this._onApiGetLanguageSummaries.bind(this)]
]);
/* eslint-enable @stylistic/no-multi-spaces */
@@ -906,6 +908,11 @@ export class Backend {
return {targetTabId, targetFrameId};
}
+ /**
+ * API handler registered under 'getLanguageSummaries': returns the result of
+ * `getLanguageSummaries()` imported from '../language/languages.js'.
+ * @type {import('api').ApiHandler<'getLanguageSummaries'>}
+ */
+ _onApiGetLanguageSummaries() {
+ return getLanguageSummaries();
+ }
+
// Command handlers
/**
@@ -2361,15 +2368,9 @@ export class Backend {
if (typeof deinflect !== 'boolean') { deinflect = true; }
const enabledDictionaryMap = this._getTranslatorEnabledDictionaryMap(options);
const {
- general: {mainDictionary, sortFrequencyDictionary, sortFrequencyDictionaryOrder},
+ general: {mainDictionary, sortFrequencyDictionary, sortFrequencyDictionaryOrder, language},
scanning: {alphanumeric},
translation: {
- convertHalfWidthCharacters,
- convertNumericCharacters,
- convertAlphabeticCharacters,
- convertHiraganaToKatakana,
- convertKatakanaToHiragana,
- collapseEmphaticSequences,
textReplacements: textReplacementsOptions,
searchResolution
}
@@ -2394,16 +2395,11 @@ export class Backend {
sortFrequencyDictionary,
sortFrequencyDictionaryOrder,
removeNonJapaneseCharacters: !alphanumeric,
- convertHalfWidthCharacters,
- convertNumericCharacters,
- convertAlphabeticCharacters,
- convertHiraganaToKatakana,
- convertKatakanaToHiragana,
- collapseEmphaticSequences,
searchResolution,
textReplacements,
enabledDictionaryMap,
- excludeDictionaryDefinitions
+ excludeDictionaryDefinitions,
+ language
};
}
diff --git a/ext/js/comm/api.js b/ext/js/comm/api.js
index b4fdbeb5e3..40b8e252d3 100644
--- a/ext/js/comm/api.js
+++ b/ext/js/comm/api.js
@@ -361,6 +361,13 @@ export class API {
return this._invoke('openCrossFramePort', {targetTabId, targetFrameId});
}
+ /**
+ * Invokes the 'getLanguageSummaries' API action; no parameters are sent (`void 0`).
+ * @returns {Promise<import('api').ApiReturn<'getLanguageSummaries'>>}
+ */
+ getLanguageSummaries() {
+ return this._invoke('getLanguageSummaries', void 0);
+ }
+
// Utilities
/**
diff --git a/ext/js/data/options-util.js b/ext/js/data/options-util.js
index 1644df2fb4..7952eafc5a 100644
--- a/ext/js/data/options-util.js
+++ b/ext/js/data/options-util.js
@@ -522,7 +522,8 @@ export class OptionsUtil {
this._updateVersion22,
this._updateVersion23,
this._updateVersion24,
- this._updateVersion25
+ this._updateVersion25,
+ this._updateVersion26
];
/* eslint-enable @typescript-eslint/unbound-method */
if (typeof targetVersion === 'number' && targetVersion < result.length) {
@@ -1154,6 +1155,31 @@ export class OptionsUtil {
}
}
+ /**
+ * - Added general.language.
+ * - Modularized text preprocessors.
+ *
+ * Mutates every profile in place: sets `general.language` to 'ja' and removes the
+ * per-preprocessor keys from `translation`.
+ * @type {import('options-util').UpdateFunction}
+ */
+ _updateVersion26(options) {
+ // Keys of the individual preprocessor settings stored under `translation`.
+ const textPreprocessors = [
+ 'convertHalfWidthCharacters',
+ 'convertNumericCharacters',
+ 'convertAlphabeticCharacters',
+ 'convertHiraganaToKatakana',
+ 'convertKatakanaToHiragana',
+ 'collapseEmphaticSequences'
+ ];
+
+ for (const {options: profileOptions} of options.profiles) {
+ // Every existing profile is assigned 'ja' unconditionally.
+ profileOptions.general.language = 'ja';
+
+ // The previous values are dropped here, not copied anywhere else.
+ for (const preprocessor of textPreprocessors) {
+ delete profileOptions.translation[preprocessor];
+ }
+ }
+ }
+
+
/**
* @param {string} url
* @returns {Promise}
diff --git a/ext/js/language/en/language-english.js b/ext/js/language/en/language-english.js
new file mode 100644
index 0000000000..8268653f41
--- /dev/null
+++ b/ext/js/language/en/language-english.js
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {capitalizeFirstLetter, decapitalize} from '../text-preprocessors.js';
+
+/**
+ * Language descriptor for English: display name, ISO code, example text, and the
+ * two text preprocessors imported from '../text-preprocessors.js'.
+ * @type {import('language-english').EnglishLanguageDescriptor}
+ */
+export const descriptor = {
+ name: 'English',
+ iso: 'en',
+ exampleText: 'read',
+ textPreprocessors: {
+ capitalizeFirstLetter,
+ decapitalize
+ }
+};
diff --git a/ext/js/language/ja/language-japanese.js b/ext/js/language/ja/language-japanese.js
new file mode 100644
index 0000000000..ced34bcd66
--- /dev/null
+++ b/ext/js/language/ja/language-japanese.js
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
+import {convertAlphabeticToKana} from './japanese-wanakana.js';
+import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth} from './japanese.js';
+
+/** @type {import('language-japanese').JapaneseLanguageDescriptor} */
+export const descriptor = {
+ name: 'Japanese',
+ iso: 'ja',
+ exampleText: '読め',
+ textPreprocessors: {
+ convertHalfWidthCharacters: {
+ name: 'Convert half width characters to full width',
+ description: 'ヨミチャン → ヨミチャン',
+ options: basicTextPreprocessorOptions,
+ /** @type {import('language').TextPreprocessorFunction} */
+ process: (str, setting, sourceMap) => (setting ? convertHalfWidthKanaToFullWidth(str, sourceMap) : str)
+ },
+ convertNumericCharacters: {
+ name: 'Convert numeric characters to full width',
+ description: '1234 → 1234',
+ options: basicTextPreprocessorOptions,
+ /** @type {import('language').TextPreprocessorFunction} */
+ process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str)
+ },
+ convertAlphabeticCharacters: {
+ name: 'Convert alphabetic characters to hiragana',
+ description: 'yomichan → よみちゃん',
+ options: basicTextPreprocessorOptions,
+ /** @type {import('language').TextPreprocessorFunction} */
+ process: (str, setting, sourceMap) => (setting ? convertAlphabeticToKana(str, sourceMap) : str)
+ },
+ convertHiraganaToKatakana: {
+ name: 'Convert hiragana to katakana',
+ description: 'よみちゃん → ヨミチャン',
+ options: basicTextPreprocessorOptions,
+ /** @type {import('language').TextPreprocessorFunction} */
+ process: (str, setting) => (setting ? convertHiraganaToKatakana(str) : str)
+ },
+ convertKatakanaToHiragana: {
+ name: 'Convert katakana to hiragana',
+ description: 'ヨミチャン → よみちゃん',
+ options: basicTextPreprocessorOptions,
+ /** @type {import('language').TextPreprocessorFunction} */
+ process: (str, setting) => (setting ? convertKatakanaToHiragana(str) : str)
+ },
+ collapseEmphaticSequences: {
+ name: 'Collapse emphatic character sequences',
+ description: 'すっっごーーい → すっごーい / すごい',
+ options: [[false, false], [true, false], [true, true]],
+ /** @type {import('language').TextPreprocessorFunction<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */
+ process: (str, setting, sourceMap) => {
+ const [collapseEmphatic, collapseEmphaticFull] = setting;
+ if (collapseEmphatic) {
+ str = collapseEmphaticSequences(str, collapseEmphaticFull, sourceMap);
+ }
+ return str;
+ }
+ }
+ }
+};
diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js
new file mode 100755
index 0000000000..f51ca163bf
--- /dev/null
+++ b/ext/js/language/languages.js
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {descriptor as descriptorEnglish} from './en/language-english.js';
+import {descriptor as descriptorJapanese} from './ja/language-japanese.js';
+
+// Descriptors of every language registered in this module.
+const languageDescriptors = [
+ descriptorEnglish,
+ descriptorJapanese
+];
+
+/**
+ * Descriptors keyed by their `iso` code.
+ * @type {Map<string, typeof languageDescriptors[number]>}
+ */
+const languageDescriptorMap = new Map();
+for (const languageDescriptor of languageDescriptors) {
+ languageDescriptorMap.set(languageDescriptor.iso, languageDescriptor);
+}
+
+/**
+ * @returns {import('language').LanguageSummary[]}
+ */
+export function getLanguageSummaries() {
+ const results = [];
+ for (const {name, iso, exampleText} of languageDescriptorMap.values()) {
+ results.push({name, iso, exampleText});
+ }
+ return results;
+}
+
+/**
+ * @returns {import('language').LanguageAndPreprocessors[]}
+ * @throws {Error}
+ */
+export function getAllLanguageTextPreprocessors() {
+ const results = [];
+ for (const {iso, textPreprocessors} of languageDescriptorMap.values()) {
+ /** @type {import('language').TextPreprocessorWithId[]} */
+ const textPreprocessorsArray = [];
+ for (const [id, textPreprocessor] of Object.entries(textPreprocessors)) {
+ textPreprocessorsArray.push({
+ id,
+ textPreprocessor: /** @type {import('language').TextPreprocessor} */ (textPreprocessor)
+ });
+ }
+ results.push({iso, textPreprocessors: textPreprocessorsArray});
+ }
+ return results;
+}
diff --git a/ext/js/language/text-preprocessors.js b/ext/js/language/text-preprocessors.js
new file mode 100755
index 0000000000..12b3d1b66d
--- /dev/null
+++ b/ext/js/language/text-preprocessors.js
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/**
+ * Setting choices shared by the on/off preprocessors: `false` and `true`.
+ * @type {import('language').TextPreprocessorOptions<boolean>}
+ */
+export const basicTextPreprocessorOptions = [false, true];
+
+/**
+ * Preprocessor that lowercases the whole string with `toLowerCase()` when its
+ * setting is truthy, and returns the string unchanged otherwise.
+ * @type {import('language').TextPreprocessor<boolean>}
+ */
+export const decapitalize = {
+ name: 'Decapitalize text',
+ description: 'CAPITALIZED TEXT → capitalized text',
+ options: basicTextPreprocessorOptions,
+ process: (str, setting) => (setting ? str.toLowerCase() : str)
+};
+
+/**
+ * Preprocessor that uppercases the first character of the string when its setting
+ * is truthy, and returns the string unchanged otherwise.
+ * @type {import('language').TextPreprocessor<boolean>}
+ */
+export const capitalizeFirstLetter = {
+ name: 'Capitalize first letter',
+ description: 'lowercase text → Lowercase text',
+ options: basicTextPreprocessorOptions,
+ // Only `charAt(0)` is uppercased; the remainder (`slice(1)`) is kept as-is, not lowercased.
+ process: (str, setting) => (setting ? str.charAt(0).toUpperCase() + str.slice(1) : str)
+};
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js
index 2ba1ce0d99..afba4e1386 100644
--- a/ext/js/language/translator.js
+++ b/ext/js/language/translator.js
@@ -18,9 +18,9 @@
import {applyTextReplacement} from '../general/regex-util.js';
import {TextSourceMap} from '../general/text-source-map.js';
-import {convertAlphabeticToKana} from './ja/japanese-wanakana.js';
-import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth, isCodePointJapanese} from './ja/japanese.js';
+import {isCodePointJapanese} from './ja/japanese.js';
import {LanguageTransformer} from './language-transformer.js';
+import {getAllLanguageTextPreprocessors} from './languages.js';
/**
* Class which finds term and kanji dictionary entries for text.
@@ -41,6 +41,8 @@ export class Translator {
this._stringComparer = new Intl.Collator('en-US'); // Invariant locale
/** @type {RegExp} */
this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/;
+ /** @type {Map<string, {textPreprocessors: import('language').TextPreprocessorWithId<unknown>[], optionSpace: import('translation-internal').PreprocessorOptionsSpace}>} */
+ this._textPreprocessors = new Map();
}
/**
@@ -49,6 +51,14 @@ export class Translator {
*/
prepare(descriptor) {
this._languageTransformer.addDescriptor(descriptor);
+ for (const {iso, textPreprocessors} of getAllLanguageTextPreprocessors()) {
+ /**
+ * Maps each text preprocessor id of this language to the option values that
+ * preprocessor declares. It is stored, together with the preprocessor list
+ * itself, in `this._textPreprocessors` under the language's `iso` key.
+ * @type {Map<string, import('language').TextPreprocessorOptions<unknown>>}
+ */
+ const optionSpace = new Map();
+ for (const {id, textPreprocessor} of textPreprocessors) {
+ optionSpace.set(id, textPreprocessor.options);
+ }
+ this._textPreprocessors.set(iso, {textPreprocessors, optionSpace});
+ }
}
/**
@@ -415,51 +425,45 @@ export class Translator {
}
}
- // Deinflections and text transformations
+ // Deinflections and text preprocessing
/**
* @param {string} text
* @param {import('translation').FindTermsOptions} options
* @returns {import('translation-internal').DatabaseDeinflection[]}
+ * @throws {Error}
*/
_getAlgorithmDeinflections(text, options) {
- /** @type {import('translation-internal').TextDeinflectionOptionsArrays} */
- const textOptionVariantArray = [
- this._getTextReplacementsVariants(options),
- this._getTextOptionEntryVariants(options.convertHalfWidthCharacters),
- this._getTextOptionEntryVariants(options.convertNumericCharacters),
- this._getTextOptionEntryVariants(options.convertAlphabeticCharacters),
- this._getTextOptionEntryVariants(options.convertHiraganaToKatakana),
- this._getTextOptionEntryVariants(options.convertKatakanaToHiragana),
- this._getCollapseEmphaticOptions(options)
- ];
+ const {language} = options;
+ const info = this._textPreprocessors.get(language);
+ if (typeof info === 'undefined') { throw new Error(`Unsupported language: ${language}`); }
+ const {textPreprocessors, optionSpace: textPreprocessorOptionsSpace} = info;
+
+ /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */
+ const variantSpace = new Map();
+ variantSpace.set('textReplacements', this._getTextReplacementsVariants(options));
+ for (const [key, value] of textPreprocessorOptionsSpace) {
+ variantSpace.set(key, value);
+ }
/** @type {import('translation-internal').DatabaseDeinflection[]} */
const deinflections = [];
const used = new Set();
- for (const [textReplacements, halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of /** @type {Generator} */ (this._getArrayVariants(textOptionVariantArray))) {
+
+ for (const arrayVariant of this._generateArrayVariants(variantSpace)) {
+ const textReplacements = /** @type {import('translation').FindTermsTextReplacement[] | null} */ (arrayVariant.get('textReplacements'));
+
let text2 = text;
const sourceMap = new TextSourceMap(text2);
+
if (textReplacements !== null) {
text2 = this._applyTextReplacements(text2, sourceMap, textReplacements);
}
- if (halfWidth) {
- text2 = convertHalfWidthKanaToFullWidth(text2, sourceMap);
- }
- if (numeric) {
- text2 = convertNumericToFullWidth(text2);
- }
- if (alphabetic) {
- text2 = convertAlphabeticToKana(text2, sourceMap);
- }
- if (katakana) {
- text2 = convertHiraganaToKatakana(text2);
- }
- if (hiragana) {
- text2 = convertKatakanaToHiragana(text2);
- }
- if (collapseEmphatic) {
- text2 = collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap);
+
+ for (const preprocessor of textPreprocessors.values()) {
+ const {id, textPreprocessor} = preprocessor;
+ const setting = arrayVariant.get(id);
+ text2 = textPreprocessor.process(text2, setting, sourceMap);
}
for (
@@ -526,36 +530,6 @@ export class Translator {
return text;
}
- /**
- * @param {import('translation').FindTermsVariantMode} value
- * @returns {boolean[]}
- */
- _getTextOptionEntryVariants(value) {
- switch (value) {
- case 'true': return [true];
- case 'variant': return [false, true];
- default: return [false];
- }
- }
-
- /**
- * @param {import('translation').FindTermsOptions} options
- * @returns {[collapseEmphatic: boolean, collapseEmphaticFull: boolean][]}
- */
- _getCollapseEmphaticOptions(options) {
- /** @type {[collapseEmphatic: boolean, collapseEmphaticFull: boolean][]} */
- const collapseEmphaticOptions = [[false, false]];
- switch (options.collapseEmphaticSequences) {
- case 'true':
- collapseEmphaticOptions.push([true, false]);
- break;
- case 'full':
- collapseEmphaticOptions.push([true, false], [true, true]);
- break;
- }
- return collapseEmphaticOptions;
- }
-
/**
* @param {import('translation').FindTermsOptions} options
* @returns {(import('translation').FindTermsTextReplacement[] | null)[]}
@@ -1343,26 +1317,32 @@ export class Translator {
}
/**
- * @param {[...args: unknown[][]]} arrayVariants
- * @yields {[...args: unknown[]]}
- * @returns {Generator}
+ * @param {Map} arrayVariants
+ * @yields {Map}
+ * @returns {Generator