Skip to content

Commit

Permalink
Normalize CJK Compatibility characters (#1618)
Browse files Browse the repository at this point in the history
* Normalize CJK_COMPATIBILITY characters

* Add tests for CJK_COMPATIBILITY normalization

* Use isCodePointInRange instead of isCodePointInRanges
  • Loading branch information
Kuuuube authored Nov 27, 2024
1 parent 22d5f41 commit f0b7009
Show file tree
Hide file tree
Showing 6 changed files with 187 additions and 1 deletion.
3 changes: 3 additions & 0 deletions ext/js/language/CJK-util.js
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ export const FULLWIDTH_CHARACTER_RANGES = [
/** @type {import('CJK-util').CodepointRange} */
export const CJK_PUNCTUATION_RANGE = [0x3000, 0x303f];

/** @type {import('CJK-util').CodepointRange} */
export const CJK_COMPATIBILITY = [0x3300, 0x33ff];

/**
* @param {number} codePoint
* @param {import('CJK-util').CodepointRange} range
Expand Down
9 changes: 9 additions & 0 deletions ext/js/language/ja/japanese-text-preprocessors.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import {
convertHalfWidthKanaToFullWidth,
convertHiraganaToKatakana as convertHiraganaToKatakanaFunction,
convertKatakanaToHiragana as convertKatakanaToHiraganaFunction,
normalizeCJKCompatibilityCharacters as normalizeCJKCompatibilityCharactersFunction,
normalizeCombiningCharacters as normalizeCombiningCharactersFunction,
} from './japanese.js';

Expand Down Expand Up @@ -99,3 +100,11 @@ export const normalizeCombiningCharacters = {
options: basicTextProcessorOptions,
process: (str, setting) => (setting ? normalizeCombiningCharactersFunction(str) : str),
};

/**
 * Text preprocessor that, when enabled, expands CJK Compatibility characters
 * in the input; when disabled, the input is passed through unchanged.
 * @type {import('language').TextProcessor<boolean>}
 */
export const normalizeCJKCompatibilityCharacters = {
    name: 'Normalize CJK Compatibility Characters',
    description: '㌀ → アパート',
    options: basicTextProcessorOptions,
    process: (str, setting) => {
        // Disabled variant: leave the text exactly as received.
        if (!setting) {
            return str;
        }
        return normalizeCJKCompatibilityCharactersFunction(str);
    },
};
15 changes: 14 additions & 1 deletion ext/js/language/ja/japanese.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

import {CJK_IDEOGRAPH_RANGES, isCodePointInRange, isCodePointInRanges} from '../CJK-util.js';
import {CJK_COMPATIBILITY, CJK_IDEOGRAPH_RANGES, isCodePointInRange, isCodePointInRanges} from '../CJK-util.js';


// Code point of っ (U+3063, HIRAGANA LETTER SMALL TU).
const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063;
Expand Down Expand Up @@ -621,6 +621,19 @@ export function normalizeCombiningCharacters(text) {
return result;
}

/**
 * Replaces every character that falls inside the `CJK_COMPATIBILITY` range with
 * its NFKD (compatibility decomposition) form. All other characters are copied
 * through unchanged, so the rest of the text is never re-normalized.
 * @param {string} text
 * @returns {string}
 */
export function normalizeCJKCompatibilityCharacters(text) {
    const pieces = [];
    for (const char of text) {
        const value = char.codePointAt(0);
        // The range lies entirely in the BMP, so astral characters never match.
        const shouldExpand = value ? isCodePointInRange(value, CJK_COMPATIBILITY) : false;
        pieces.push(shouldExpand ? char.normalize('NFKD') : char);
    }
    return pieces.join('');
}

// Furigana distribution

/**
Expand Down
2 changes: 2 additions & 0 deletions ext/js/language/language-descriptors.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import {
collapseEmphaticSequences,
convertHalfWidthCharacters,
convertHiraganaToKatakana,
normalizeCJKCompatibilityCharacters,
normalizeCombiningCharacters,
} from './ja/japanese-text-preprocessors.js';
import {japaneseTransforms} from './ja/japanese-transforms.js';
Expand Down Expand Up @@ -213,6 +214,7 @@ const languageDescriptors = [
convertHalfWidthCharacters,
alphabeticToHiragana,
normalizeCombiningCharacters,
normalizeCJKCompatibilityCharacters,
normalizeRadicalCharacters,
alphanumericWidthVariants,
convertHiraganaToKatakana,
Expand Down
158 changes: 158 additions & 0 deletions test/japanese-util.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -1067,3 +1067,161 @@ describe('combining dakuten/handakuten normalization', () => {
expect(jp.normalizeCombiningCharacters(input)).toStrictEqual(expected);
});
});

// Table-driven check of jp.normalizeCJKCompatibilityCharacters: every entry is
// [single CJK Compatibility character, expected expansion].
// NOTE(review): toStrictEqual compares exact code units, so the expected strings
// presumably have to be in the same NFKD (decomposed) form the implementation
// emits — confirm before re-typing any entry that contains dakuten/handakuten.
describe('cjk compatibility characters normalization', () => {
const testCases = [
// Squared katakana words (units, currencies, housing terms, ...)
['㌀', 'アパート'],
['㌁', 'アルファ'],
['㌂', 'アンペア'],
['㌃', 'アール'],
['㌄', 'イニング'],
['㌅', 'インチ'],
['㌆', 'ウォン'],
['㌇', 'エスクード'],
['㌈', 'エーカー'],
['㌉', 'オンス'],
['㌊', 'オーム'],
['㌋', 'カイリ'],
['㌌', 'カラット'],
['㌍', 'カロリー'],
['㌎', 'ガロン'],
['㌏', 'ガンマ'],
['㌐', 'ギガ'],
['㌑', 'ギニー'],
['㌒', 'キュリー'],
['㌓', 'ギルダー'],
['㌔', 'キロ'],
['㌕', 'キログラム'],
['㌖', 'キロメートル'],
['㌗', 'キロワット'],
['㌘', 'グラム'],
['㌙', 'グラムトン'],
['㌚', 'クルゼイロ'],
['㌛', 'クローネ'],
['㌜', 'ケース'],
['㌝', 'コルナ'],
['㌞', 'コーポ'],
['㌟', 'サイクル'],
['㌠', 'サンチーム'],
['㌡', 'シリング'],
['㌢', 'センチ'],
['㌣', 'セント'],
['㌤', 'ダース'],
['㌥', 'デシ'],
['㌦', 'ドル'],
['㌧', 'トン'],
['㌨', 'ナノ'],
['㌩', 'ノット'],
['㌪', 'ハイツ'],
['㌫', 'パーセント'],
['㌬', 'パーツ'],
['㌭', 'バーレル'],
['㌮', 'ピアストル'],
['㌯', 'ピクル'],
['㌰', 'ピコ'],
['㌱', 'ビル'],
['㌲', 'ファラッド'],
['㌳', 'フィート'],
['㌴', 'ブッシェル'],
['㌵', 'フラン'],
['㌶', 'ヘクタール'],
['㌷', 'ペソ'],
['㌸', 'ペニヒ'],
['㌹', 'ヘルツ'],
['㌺', 'ペンス'],
['㌻', 'ページ'],
['㌼', 'ベータ'],
['㌽', 'ポイント'],
['㌾', 'ボルト'],
['㌿', 'ホン'],
['㍀', 'ポンド'],
['㍁', 'ホール'],
['㍂', 'ホーン'],
['㍃', 'マイクロ'],
['㍄', 'マイル'],
['㍅', 'マッハ'],
['㍆', 'マルク'],
['㍇', 'マンション'],
['㍈', 'ミクロン'],
['㍉', 'ミリ'],
['㍊', 'ミリバール'],
['㍋', 'メガ'],
['㍌', 'メガトン'],
['㍍', 'メートル'],
['㍎', 'ヤード'],
['㍏', 'ヤール'],
['㍐', 'ユアン'],
['㍑', 'リットル'],
['㍒', 'リラ'],
['㍓', 'ルピー'],
['㍔', 'ルーブル'],
['㍕', 'レム'],
['㍖', 'レントゲン'],
['㍗', 'ワット'],
// Hour symbols, 0点 through 24点
['㍘', '0点'],
['㍙', '1点'],
['㍚', '2点'],
['㍛', '3点'],
['㍜', '4点'],
['㍝', '5点'],
['㍞', '6点'],
['㍟', '7点'],
['㍠', '8点'],
['㍡', '9点'],
['㍢', '10点'],
['㍣', '11点'],
['㍤', '12点'],
['㍥', '13点'],
['㍦', '14点'],
['㍧', '15点'],
['㍨', '16点'],
['㍩', '17点'],
['㍪', '18点'],
['㍫', '19点'],
['㍬', '20点'],
['㍭', '21点'],
['㍮', '22点'],
['㍯', '23点'],
['㍰', '24点'],
// Era names and the squared 株式会社
['㍻', '平成'],
['㍼', '昭和'],
['㍽', '大正'],
['㍾', '明治'],
['㍿', '株式会社'],
// Day-of-month symbols, 1日 through 31日
['㏠', '1日'],
['㏡', '2日'],
['㏢', '3日'],
['㏣', '4日'],
['㏤', '5日'],
['㏥', '6日'],
['㏦', '7日'],
['㏧', '8日'],
['㏨', '9日'],
['㏩', '10日'],
['㏪', '11日'],
['㏫', '12日'],
['㏬', '13日'],
['㏭', '14日'],
['㏮', '15日'],
['㏯', '16日'],
['㏰', '17日'],
['㏱', '18日'],
['㏲', '19日'],
['㏳', '20日'],
['㏴', '21日'],
['㏵', '22日'],
['㏶', '23日'],
['㏷', '24日'],
['㏸', '25日'],
['㏹', '26日'],
['㏺', '27日'],
['㏻', '28日'],
['㏼', '29日'],
['㏽', '30日'],
['㏾', '31日'],
];

// One generated test per table row; the title interpolates input and expected output.
test.each(testCases)('%s normalizes to %s', (input, expected) => {
expect(jp.normalizeCJKCompatibilityCharacters(input)).toStrictEqual(expected);
});
});
1 change: 1 addition & 0 deletions types/ext/language-descriptors.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ type AllTextProcessors = {
convertHalfWidthCharacters: TextProcessor<boolean>;
alphabeticToHiragana: TextProcessor<boolean>;
normalizeCombiningCharacters: TextProcessor<boolean>;
normalizeCJKCompatibilityCharacters: TextProcessor<boolean>;
normalizeRadicalCharacters: TextProcessor<boolean>;
alphanumericWidthVariants: BidirectionalConversionPreprocessor;
convertHiraganaToKatakana: BidirectionalConversionPreprocessor;
Expand Down

0 comments on commit f0b7009

Please sign in to comment.