/*
 * Copyright (C) 2024  Yomitan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

/*
 * Module: ext/js/language/ja/shinjitai-converter.js
 *
 * The change that introduces this module also touches four other files:
 *
 *   - .eslintrc.json
 *       Adds "ext/js/language/ja/shinjitai-converter.js" to the file list, directly after
 *       "ext/js/language/ja/japanese-text-preprocessors.js".
 *
 *   - ext/js/language/ja/japanese-text-preprocessors.js
 *       Imports `convertShinjitai` from './shinjitai-converter.js' under the local name
 *       `convertShinjitaiFunction` and exports a `convertShinjitai` text processor with
 *       name 'Convert Kyujitai to Shinjitai', description '萬 → 万' and options
 *       `basicTextProcessorOptions`. Its `process(str, setting)` returns
 *       `convertShinjitaiFunction(str)` when `setting` is truthy and `str` otherwise.
 *
 *   - ext/js/language/language-descriptors.js
 *       Imports that text processor from './ja/japanese-text-preprocessors.js' and lists it
 *       after `collapseEmphaticSequences` in the descriptor that uses `japaneseTransforms`.
 *
 *   - types/ext/language-descriptors.d.ts
 *       Declares `convertShinjitai: TextProcessor;` after `collapseEmphaticSequences` in
 *       `AllTextProcessors`.
 */

/**
 * Map used to convert 旧字体 (Kyujitai) to 新字体 (Shinjitai), compiled from various sources.
 * This map does not include transformations caused by the 同音による書き換え reform.
 * This map does not include 俗字, 別体, 誤字 or other uncommon forms or variants.
 *
 * NOTE(review): several pairs render identically in most fonts. The keys of those pairs are
 * presumably compatibility ideographs that differ from their values by code point only, so
 * do not "deduplicate" or retype them by eye - confirm the code points first.
 * @type {Map<string, string>}
 */
const shinjitaiMap = new Map([
    // A source comment signifies that the entries below are from that source and not included in any of the above ones.

    /* https://github.com/DrTurnon/kyujipy */

    // 常用漢字
    ['亞', '亜'],
    ['惡', '悪'],
    ['壓', '圧'],
    ['圍', '囲'],
    ['醫', '医'],
    ['爲', '為'],
    ['壹', '壱'],
    ['逸', '逸'],
    ['飮', '飲'],
    ['隱', '隠'],
    ['羽', '羽'],
    ['榮', '栄'],
    ['營', '営'],
    ['銳', '鋭'],
    ['衞', '衛'],
    ['益', '益'],
    ['驛', '駅'],
    ['悅', '悦'],
    ['謁', '謁'],
    ['閱', '閲'],
    ['圓', '円'],
    ['鹽', '塩'],
    ['緣', '縁'],
    ['艷', '艶'],
    ['應', '応'],
    ['歐', '欧'],
    ['毆', '殴'],
    ['櫻', '桜'],
    ['奧', '奥'],
    ['橫', '横'],
    ['溫', '温'],
    ['穩', '穏'],
    ['假', '仮'],
    ['價', '価'],
    ['禍', '禍'],
    ['畫', '画'],
    ['會', '会'],
    ['悔', '悔'],
    ['海', '海'],
    ['繪', '絵'],
    ['壞', '壊'],
    ['懷', '懐'],
    ['慨', '慨'],
    ['槪', '概'],
    ['擴', '拡'],
    ['殼', '殻'],
    ['覺', '覚'],
    ['學', '学'],
    ['嶽', '岳'],
    ['樂', '楽'],
    ['喝', '喝'],
    ['渴', '渇'],
    ['褐', '褐'],
    ['罐', '缶'],
    ['卷', '巻'],
    ['陷', '陥'],
    ['勸', '勧'],
    ['寬', '寛'],
    ['漢', '漢'],
    ['關', '関'],
    ['歡', '歓'],
    ['館', '館'],
    ['觀', '観'],
    ['顏', '顔'],
    ['氣', '気'],
    ['祈', '祈'],
    ['既', '既'],
    ['歸', '帰'],
    ['龜', '亀'],
    ['器', '器'],
    ['僞', '偽'],
    ['戲', '戯'],
    ['犧', '犠'],
    ['舊', '旧'],
    ['據', '拠'],
    ['擧', '挙'],
    ['虛', '虚'],
    ['峽', '峡'],
    ['挾', '挟'],
    ['狹', '狭'],
    ['敎', '教'],
    ['鄕', '郷'],
    ['響', '響'],
    ['曉', '暁'],
    ['勤', '勤'],
    ['謹', '謹'],
    ['區', '区'],
    ['驅', '駆'],
    ['勳', '勲'],
    ['薰', '薫'],
    ['徑', '径'],
    ['莖', '茎'],
    ['契', '契'],
    ['惠', '恵'],
    ['揭', '掲'],
    ['溪', '渓'],
    ['經', '経'],
    ['螢', '蛍'],
    ['輕', '軽'],
    ['繼', '継'],
    ['鷄', '鶏'],
    ['藝', '芸'],
    ['擊', '撃'],
    ['缺', '欠'],
    ['硏', '研'],
    ['縣', '県'],
    ['儉', '倹'],
    ['劍', '剣'],
    ['險', '険'],
    ['圈', '圏'],
    ['檢', '検'],
    ['獻', '献'],
    ['權', '権'],
    ['顯', '顕'],
    ['驗', '験'],
    ['嚴', '厳'],
    ['戶', '戸'],
    ['吳', '呉'],
    ['娛', '娯'],
    ['廣', '広'],
    ['效', '効'],
    ['恆', '恒'],
    ['黃', '黄'],
    ['鑛', '鉱'],
    ['號', '号'],
    ['吿', '告'],
    ['國', '国'],
    ['黑', '黒'],
    ['穀', '穀'],
    ['碎', '砕'],
    ['濟', '済'],
    ['齋', '斎'],
    ['歲', '歳'],
    ['劑', '剤'],
    ['殺', '殺'],
    ['雜', '雑'],
    ['參', '参'],
    ['棧', '桟'],
    ['蠶', '蚕'],
    ['慘', '惨'],
    ['產', '産'],
    ['贊', '賛'],
    ['殘', '残'],
    ['絲', '糸'],
    ['祉', '祉'],
    ['視', '視'],
    ['齒', '歯'],
    ['飼', '飼'],
    ['兒', '児'],
    ['辭', '辞'],
    ['濕', '湿'],
    ['實', '実'],
    ['寫', '写'],
    ['社', '社'],
    ['舍', '舎'],
    ['者', '者'],
    ['煮', '煮'],
    ['釋', '釈'],
    ['壽', '寿'],
    ['收', '収'],
    ['臭', '臭'],
    ['從', '従'],
    ['澁', '渋'],
    ['獸', '獣'],
    ['縱', '縦'],
    ['祝', '祝'],
    ['肅', '粛'],
    ['處', '処'],
    ['暑', '暑'],
    ['署', '署'],
    ['緖', '緒'],
    ['諸', '諸'],
    ['敍', '叙'],
    ['尙', '尚'],
    ['將', '将'],
    ['祥', '祥'],
    ['稱', '称'],
    ['涉', '渉'],
    ['燒', '焼'],
    ['證', '証'],
    ['奬', '奨'],
    ['條', '条'],
    ['狀', '状'],
    ['乘', '乗'],
    ['淨', '浄'],
    ['剩', '剰'],
    ['疊', '畳'],
    ['繩', '縄'],
    ['壤', '壌'],
    ['孃', '嬢'],
    ['讓', '譲'],
    ['釀', '醸'],
    ['觸', '触'],
    ['囑', '嘱'],
    ['神', '神'],
    ['眞', '真'],
    ['寢', '寝'],
    ['愼', '慎'],
    ['盡', '尽'],
    ['圖', '図'],
    ['粹', '粋'],
    ['醉', '酔'],
    ['穗', '穂'],
    ['隨', '随'],
    ['髓', '髄'],
    ['樞', '枢'],
    ['數', '数'],
    ['瀨', '瀬'],
    ['聲', '声'],
    ['靑', '青'],
    ['齊', '斉'],
    ['淸', '清'],
    ['晴', '晴'],
    ['精', '精'],
    ['靜', '静'],
    ['稅', '税'],
    ['竊', '窃'],
    ['攝', '摂'],
    ['節', '節'],
    ['說', '説'],
    ['絕', '絶'],
    ['專', '専'],
    ['淺', '浅'],
    ['戰', '戦'],
    ['踐', '践'],
    ['錢', '銭'],
    ['潛', '潜'],
    ['纖', '繊'],
    ['禪', '禅'],
    ['祖', '祖'],
    ['雙', '双'],
    ['壯', '壮'],
    ['爭', '争'],
    ['莊', '荘'],
    ['搜', '捜'],
    ['插', '挿'],
    ['巢', '巣'],
    ['曾', '曽'],
    ['瘦', '痩'],
    ['裝', '装'],
    ['僧', '僧'],
    ['層', '層'],
    ['總', '総'],
    ['騷', '騒'],
    ['增', '増'],
    ['憎', '憎'],
    ['藏', '蔵'],
    ['贈', '贈'],
    ['臟', '臓'],
    ['卽', '即'],
    ['屬', '属'],
    ['續', '続'],
    ['墮', '堕'],
    ['對', '対'],
    ['體', '体'],
    ['帶', '帯'],
    ['滯', '滞'],
    ['臺', '台'],
    ['瀧', '滝'],
    ['擇', '択'],
    ['澤', '沢'],
    ['脫', '脱'],
    ['擔', '担'],
    ['單', '単'],
    ['膽', '胆'],
    ['嘆', '嘆'],
    ['團', '団'],
    ['斷', '断'],
    ['彈', '弾'],
    ['遲', '遅'],
    ['癡', '痴'],
    ['蟲', '虫'],
    ['晝', '昼'],
    ['鑄', '鋳'],
    ['著', '著'],
    ['廳', '庁'],
    ['徵', '徴'],
    ['聽', '聴'],
    ['懲', '懲'],
    ['敕', '勅'],
    ['鎭', '鎮'],
    ['塚', '塚'],
    ['遞', '逓'],
    ['鐵', '鉄'],
    ['點', '点'],
    ['轉', '転'],
    ['傳', '伝'],
    ['都', '都'],
    ['燈', '灯'],
    ['當', '当'],
    ['黨', '党'],
    ['盜', '盗'],
    ['稻', '稲'],
    ['鬭', '闘'],
    ['德', '徳'],
    ['獨', '独'],
    ['讀', '読'],
    ['突', '突'],
    ['屆', '届'],
    ['內', '内'],
    ['難', '難'],
    ['貳', '弐'],
    // NOTE(review): the key below appears to be repeated on the following line. If the two
    // keys are the same code point, the later entry (→ 脳) wins in the Map and this one is
    // dead; 惱 → 悩 is listed in the 新字源 section further down. Confirm and remove.
    ['腦', '悩'],
    ['腦', '脳'],
    ['霸', '覇'],
    ['拜', '拝'],
    ['廢', '廃'],
    ['賣', '売'],
    ['梅', '梅'],
    ['麥', '麦'],
    ['發', '発'],
    ['髮', '髪'],
    ['拔', '抜'],
    ['飯', '飯'],
    ['繁', '繁'],
    ['晚', '晩'],
    ['蠻', '蛮'],
    ['卑', '卑'],
    ['祕', '秘'],
    ['碑', '碑'],
    ['濱', '浜'],
    ['賓', '賓'],
    ['頻', '頻'],
    ['敏', '敏'],
    ['甁', '瓶'],
    ['侮', '侮'],
    ['福', '福'],
    ['拂', '払'],
    ['佛', '仏'],
    ['倂', '併'],
    ['竝', '並'],
    ['塀', '塀'],
    ['餠', '餅'],
    ['邊', '辺'],
    ['變', '変'],
    ['勉', '勉'],
    ['步', '歩'],
    ['舖', '舗'],
    ['寶', '宝'],
    ['豐', '豊'],
    ['襃', '褒'],
    ['墨', '墨'],
    ['沒', '没'],
    ['飜', '翻'],
    ['每', '毎'],
    ['萬', '万'],
    ['滿', '満'],
    ['免', '免'],
    ['麵', '麺'],
    ['默', '黙'],
    ['彌', '弥'],
    ['譯', '訳'],
    ['藥', '薬'],
    ['與', '与'],
    ['豫', '予'],
    ['餘', '余'],
    ['譽', '誉'],
    ['搖', '揺'],
    ['樣', '様'],
    ['謠', '謡'],
    ['來', '来'],
    ['賴', '頼'],
    ['亂', '乱'],
    ['覽', '覧'],
    ['欄', '欄'],
    ['龍', '竜'],
    ['隆', '隆'],
    ['旅', '旅'],
    ['虜', '虜'],
    ['兩', '両'],
    ['獵', '猟'],
    ['綠', '緑'],
    ['淚', '涙'],
    ['壘', '塁'],
    ['類', '類'],
    ['禮', '礼'],
    ['勵', '励'],
    ['戾', '戻'],
    ['靈', '霊'],
    ['隸', '隷'],
    ['齡', '齢'],
    ['曆', '暦'],
    ['歷', '歴'],
    ['戀', '恋'],
    ['連', '連'],
    ['廉', '廉'],
    ['練', '練'],
    ['鍊', '錬'],
    ['爐', '炉'],
    ['勞', '労'],
    ['郞', '郎'],
    ['朗', '朗'],
    ['廊', '廊'],
    ['樓', '楼'],
    ['籠', '篭'],
    ['錄', '録'],
    ['灣', '湾'],

    // 人名用漢字
    ['堯', '尭'],
    ['巖', '巌'],
    ['摑', '掴'],
    ['彥', '彦'],
    ['檜', '桧'],
    ['槇', '槙'],
    ['渚', '渚'],
    ['猪', '猪'],
    ['琢', '琢'],
    ['瑤', '瑶'],
    ['禰', '祢'],
    ['祐', '祐'],
    ['禱', '祷'],
    ['祿', '禄'],
    ['禎', '禎'],
    ['穰', '穣'],
    ['簞', '箪'],
    ['聰', '聡'],
    ['蓮', '蓮'],
    ['蘭', '蘭'],
    ['遙', '遥'],
    ['遼', '遼'],
    ['靖', '靖'],

    // 表外字 (擴張新字體)
    ['蘒', '蘒'],
    ['啞', '唖'],
    ['噓', '嘘'],
    ['穎', '頴'],
    ['鷗', '鴎'],
    ['軀', '躯'],
    ['鶯', '鴬'],
    ['攪', '撹'],
    ['麴', '麹'],
    ['鹼', '鹸'],
    ['嚙', '噛'],
    ['繡', '繍'],
    ['蔣', '蒋'],
    ['醬', '醤'],
    ['搔', '掻'],
    ['屛', '屏'],
    ['幷', '并'],
    ['濾', '沪'],
    ['蘆', '芦'],
    ['蠟', '蝋'],
    ['彎', '弯'],
    ['焰', '焔'],
    ['礦', '砿'],
    ['讚', '讃'],
    ['顚', '顛'],
    ['巓', '巔'],
    ['醱', '醗'],
    ['潑', '溌'],
    ['輛', '輌'],
    ['繫', '繋'],
    ['瀆', '涜'],
    ['儘', '侭'],
    ['藪', '薮'],
    ['蠅', '蝿'],
    ['嬀', '媯'],
    ['驒', '騨'],

    // variants

    // 常用漢字
    ['鬥', '闘'],
    ['鬪', '闘'],
    ['鬬', '闘'],

    // 人名用漢字
    ['亙', '亘'],
    ['凜', '凛'],
    ['晄', '晃'],
    ['晉', '晋'],
    ['萠', '萌'],

    /* 新字源 */

    ['冬', '冬'],
    ['割', '割'],
    ['勇', '勇'],
    ['周', '周'],
    ['噴', '噴'],
    ['城', '城'],
    ['墳', '墳'],
    ['奔', '奔'],
    ['姬', '姫'],
    ['寧', '寧'],
    ['瓣', '弁'],
    ['辨', '弁'],
    ['辯', '弁'],
    ['彫', '彫'],
    ['惱', '悩'],
    ['慈', '慈'],
    ['憤', '憤'],
    ['憲', '憲'],
    ['成', '成'],
    ['戴', '戴'],
    ['搜', '捜'],
    ['滋', '滋'],
    ['潮', '潮'],
    ['炭', '炭'],
    ['爵', '爵'],
    ['異', '異'],
    ['盛', '盛'],
    ['𥔵', '磁'],
    ['𥳑', '簡'],
    ['糖', '糖'],
    ['𦤶', '致'],
    ['芽', '芽'],
    ['若', '若'],
    ['茶', '茶'],
    ['華', '華'],
    ['落', '落'],
    ['葉', '葉'],
    ['藍', '藍'],
    ['覆', '覆'],
    ['諭', '諭'],
    ['諾', '諾'],
    ['輸', '輸'],
    ['閒', '間'],
    ['降', '降'],

    /* 三省堂 */
    ['充', '充'],
    ['册', '冊'],
    ['勺', '勺'],
    ['巽', '巽'],
    ['强', '強'],
    ['旣', '既'],
    ['流', '流'],
    ['浩', '浩'],
    ['煕', '熙'],

    /* 大修館 */
    ['兔', '兎'],
    ['廚', '厨'],
    ['廏', '厩'],
    ['壻', '婿'],
    ['槪', '概'],
]);

/**
 * Character class matching any single key of `shinjitaiMap`.
 *
 * The `u` flag is required: some keys (e.g. 𥔵, 𥳑, 𦤶) lie outside the Basic Multilingual
 * Plane. Without `u` the class is built from their individual UTF-16 surrogates, each match
 * is a lone surrogate, the map lookup misses, and those characters are never converted.
 */
const kyujitaiPattern = new RegExp(`[${[...shinjitaiMap.keys()].join('')}]`, 'gu');

/**
 * Replaces every character of `str` that is a key of `shinjitaiMap` with the mapped
 * character. All other characters are returned unchanged.
 * @param {string} str Text to convert.
 * @returns {string} `str` with each mapped character replaced.
 */
export function convertShinjitai(str) {
    // The fallback only guards against a match that is not a map key; with the `u` flag every
    // match is a whole code point taken from the map's own keys.
    return str.replace(kyujitaiPattern, (char) => shinjitaiMap.get(char) ?? char);
}