Skip to content

Commit

Permalink
rework text processors
Browse files Browse the repository at this point in the history
  • Loading branch information
StefanVukovic99 committed Mar 29, 2024
1 parent 2c5af21 commit 205a02a
Show file tree
Hide file tree
Showing 22 changed files with 340 additions and 687 deletions.
1 change: 0 additions & 1 deletion .eslintrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -639,7 +639,6 @@
"ext/js/general/cache-map.js",
"ext/js/general/object-property-accessor.js",
"ext/js/general/regex-util.js",
"ext/js/general/text-source-map.js",
"ext/js/language/ar/arabic-text-preprocessors.js",
"ext/js/language/de/german-text-preprocessors.js",
"ext/js/language/en/english-transforms.js",
Expand Down
12 changes: 1 addition & 11 deletions ext/js/general/regex-util.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,12 @@ const matchReplacementPattern = /\$(?:\$|&|`|'|(\d\d?)|<([^>]*)>)/g;
* Applies string.replace using a regular expression and replacement string as arguments.
* No source map is maintained; only the resulting text is returned.
* @param {string} text A string of the text to replace.
* @param {import('./text-source-map.js').TextSourceMap} sourceMap An instance of `TextSourceMap` which corresponds to `text`.
* @param {RegExp} pattern A regular expression to use as the replacement.
* @param {string} replacement A replacement string that follows the format of the standard
* JavaScript regular expression replacement string.
* @returns {string} A new string with the pattern replacements applied.
*/
export function applyTextReplacement(text, sourceMap, pattern, replacement) {
export function applyTextReplacement(text, pattern, replacement) {
const isGlobal = pattern.global;
if (isGlobal) { pattern.lastIndex = 0; }
for (let loop = true; loop; loop = isGlobal) {
Expand All @@ -44,15 +43,6 @@ export function applyTextReplacement(text, sourceMap, pattern, replacement) {

text = `${text.substring(0, index)}${actualReplacement}${text.substring(index + matchText.length)}`;
pattern.lastIndex += delta;

if (actualReplacementLength > 0) {
/** @type {number[]} */
const zeroes = new Array(actualReplacementLength).fill(0);
sourceMap.insert(index, ...zeroes);
sourceMap.combine(index - 1 + actualReplacementLength, matchText.length);
} else {
sourceMap.combine(index, matchText.length);
}
}
return text;
}
Expand Down
153 changes: 0 additions & 153 deletions ext/js/general/text-source-map.js

This file was deleted.

6 changes: 3 additions & 3 deletions ext/js/language/ar/arabic-text-preprocessors.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
import {basicTextProcessorOptions} from '../text-preprocessors.js';

const optionalDiacritics = [
'\u0618', // Small Fatha
Expand All @@ -38,11 +38,11 @@ const optionalDiacritics = [

const diacriticsRegex = new RegExp(`[${optionalDiacritics.join('')}]`, 'g');

/** @type {import('language').TextPreprocessor<boolean>} */
/** @type {import('language').TextProcessor<boolean>} */
export const removeArabicScriptDiacritics = {
name: 'Remove diacritics',
description: 'وَلَدَ ⬅️ ولد',
options: basicTextPreprocessorOptions,
options: basicTextProcessorOptions,
process: (text, setting) => {
return setting ? text.replace(diacriticsRegex, '') : text;
}
Expand Down
24 changes: 12 additions & 12 deletions ext/js/language/ja/japanese-text-preprocessors.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
import {basicTextProcessorOptions} from '../text-preprocessors.js';
import {convertAlphabeticToKana} from './japanese-wanakana.js';
import {
collapseEmphaticSequences as collapseEmphaticSequencesFunction,
Expand All @@ -25,28 +25,28 @@ import {
convertNumericToFullWidth
} from './japanese.js';

/** @type {import('language').TextPreprocessor<boolean>} */
/** @type {import('language').TextProcessor<boolean>} */
export const convertHalfWidthCharacters = {
name: 'Convert half width characters to full width',
description: 'ﾖﾐﾁｬﾝ → ヨミチャン',
options: basicTextPreprocessorOptions,
process: (str, setting, sourceMap) => (setting ? convertHalfWidthKanaToFullWidth(str, sourceMap) : str)
options: basicTextProcessorOptions,
process: (str, setting) => (setting ? convertHalfWidthKanaToFullWidth(str) : str)
};

/** @type {import('language').TextPreprocessor<boolean>} */
/** @type {import('language').TextProcessor<boolean>} */
export const convertNumericCharacters = {
name: 'Convert numeric characters to full width',
description: '1234 → １２３４',
options: basicTextPreprocessorOptions,
options: basicTextProcessorOptions,
process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str)
};

/** @type {import('language').TextPreprocessor<boolean>} */
/** @type {import('language').TextProcessor<boolean>} */
export const convertAlphabeticCharacters = {
name: 'Convert alphabetic characters to hiragana',
description: 'yomichan → よみちゃん',
options: basicTextPreprocessorOptions,
process: (str, setting, sourceMap) => (setting ? convertAlphabeticToKana(str, sourceMap) : str)
options: basicTextProcessorOptions,
process: (str, setting) => (setting ? convertAlphabeticToKana(str) : str)
};

/** @type {import('language').BidirectionalConversionPreprocessor} */
Expand All @@ -66,15 +66,15 @@ export const convertHiraganaToKatakana = {
}
};

/** @type {import('language').TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */
/** @type {import('language').TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */
export const collapseEmphaticSequences = {
name: 'Collapse emphatic character sequences',
description: 'すっっごーーい → すっごーい / すごい',
options: [[false, false], [true, false], [true, true]],
process: (str, setting, sourceMap) => {
process: (str, setting) => {
const [collapseEmphatic, collapseEmphaticFull] = setting;
if (collapseEmphatic) {
str = collapseEmphaticSequencesFunction(str, collapseEmphaticFull, sourceMap);
str = collapseEmphaticSequencesFunction(str, collapseEmphaticFull);
}
return str;
}
Expand Down
52 changes: 5 additions & 47 deletions ext/js/language/ja/japanese-wanakana.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,51 +19,10 @@ import * as wanakana from '../../../lib/wanakana.js';

/**
* @param {string} text
* @param {?import('../../general/text-source-map.js').TextSourceMap} sourceMap
* @param {number} sourceMapStart
* @returns {string}
*/
function convertAlphabeticPartToKana(text, sourceMap, sourceMapStart) {
const result = wanakana.toHiragana(text);

// Generate source mapping
if (sourceMap !== null) {
let i = 0;
let resultPos = 0;
const ii = text.length;
while (i < ii) {
// Find smallest matching substring
let iNext = i + 1;
let resultPosNext = result.length;
while (iNext < ii) {
const t = wanakana.toHiragana(text.substring(0, iNext));
if (t === result.substring(0, t.length)) {
resultPosNext = t.length;
break;
}
++iNext;
}

// Merge characters
const removals = iNext - i - 1;
if (removals > 0) {
sourceMap.combine(sourceMapStart, removals);
}
++sourceMapStart;

// Empty elements
const additions = resultPosNext - resultPos - 1;
for (let j = 0; j < additions; ++j) {
sourceMap.insert(sourceMapStart, 0);
++sourceMapStart;
}

i = iNext;
resultPos = resultPosNext;
}
}

return result;
function convertAlphabeticPartToKana(text) {
return wanakana.toHiragana(text);
}

/**
Expand All @@ -84,10 +43,9 @@ export function convertToRomaji(text) {

/**
* @param {string} text
* @param {?import('../../general/text-source-map.js').TextSourceMap} sourceMap
* @returns {string}
*/
export function convertAlphabeticToKana(text, sourceMap = null) {
export function convertAlphabeticToKana(text) {
let part = '';
let result = '';

Expand All @@ -106,7 +64,7 @@ export function convertAlphabeticToKana(text, sourceMap = null) {
c = 0x2d; // '-'
} else {
if (part.length > 0) {
result += convertAlphabeticPartToKana(part, sourceMap, result.length);
result += convertAlphabeticPartToKana(part);
part = '';
}
result += char;
Expand All @@ -116,7 +74,7 @@ export function convertAlphabeticToKana(text, sourceMap = null) {
}

if (part.length > 0) {
result += convertAlphabeticPartToKana(part, sourceMap, result.length);
result += convertAlphabeticPartToKana(part);
}
return result;
}
Loading

0 comments on commit 205a02a

Please sign in to comment.