diff --git a/built/t/t1.d.ts b/built/t/t1.d.ts new file mode 100644 index 00000000..cb0ff5c3 --- /dev/null +++ b/built/t/t1.d.ts @@ -0,0 +1 @@ +export {}; diff --git a/built/t/t1.js b/built/t/t1.js new file mode 100644 index 00000000..63241b79 --- /dev/null +++ b/built/t/t1.js @@ -0,0 +1,57 @@ +"use strict"; +var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { + function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } + return new (P || (P = Promise))(function (resolve, reject) { + function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } + function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } + function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } + step((generator = generator.apply(thisArg, _arguments || [])).next()); + }); +}; +Object.defineProperty(exports, "__esModule", { value: true }); +const fs_1 = require("fs"); +const encoding_1 = require("../utils/encoding"); +function main() { + return __awaiter(this, void 0, void 0, function* () { + //p1(`${__dirname}/../../sample/sjis.html`); + p1(`${__dirname}/../../sample/_jis.html`); + }); +} +function p1(file) { + const content = (0, fs_1.readFileSync)(file); + const enc = (0, encoding_1.detectEncoding)(content); + console.log('enc', enc); + const count = 10000; + /* + console.log(`iconv-lite`); + { + const t0 = performance.now(); + for (let i=0;i { + console.log('Done'); +}); diff --git a/built/utils/encoding.d.ts b/built/utils/encoding.d.ts index 3f7da7be..061689a4 100644 --- a/built/utils/encoding.d.ts +++ b/built/utils/encoding.d.ts @@ -5,3 +5,5 @@ */ export declare function detectEncoding(body: Buffer): string; export declare function toUtf8(body: Buffer, encoding: string): string; +export declare function toUtf8i(body: Buffer, encoding: string): string; +export declare function toUtf8j(body: Buffer, encoding: string): string; diff --git a/built/utils/encoding.js b/built/utils/encoding.js index 3ad1b192..b09cbdb9 100644 --- a/built/utils/encoding.js +++ b/built/utils/encoding.js @@ -2,8 +2,13 @@ Object.defineProperty(exports, "__esModule", { value: true }); exports.detectEncoding = detectEncoding; exports.toUtf8 = toUtf8; +exports.toUtf8i = toUtf8i; +exports.toUtf8j = toUtf8j; const iconv = require("iconv-lite"); +//var Iconv = require('iconv').Iconv; +const iconv_1 = require("iconv"); const jschardet = require("jschardet"); +const Encoding = require('encoding-japanese'); const regCharset = new RegExp(/charset\s*=\s*["']?([\w-]+)/, 'i'); /** * Detect HTML encoding @@ -32,7 +37,16 @@ function detectEncoding(body) { function toUtf8(body, encoding) { return iconv.decode(body, encoding); } +function toUtf8i(body, encoding) { + const i = new iconv_1.Iconv(encoding, 'UTF-8'); + return i.convert(body).toString(); +} +function toUtf8j(body, encoding) { + return Encoding.codeToString(Encoding.convert(body, 'UNICODE', encoding)); +} function toEncoding(candicate) { + if (candicate.toUpperCase() === 'ISO-2022-JP') + return 'ISO-2022-JP'; if (iconv.encodingExists(candicate)) { if (['shift_jis', 'shift-jis', 'windows-31j', 'x-sjis'].includes(candicate.toLowerCase())) return 'cp932'; diff --git a/package.json b/package.json index 16276b42..dec7e2cb 100644 --- a/package.json +++ b/package.json @@ -24,11 +24,13 @@ }, "dependencies": { "cheerio": "1.0.0", + "encoding-japanese": "2.2.0", "escape-regexp": "0.0.1", "got": "11.8.6", "h3": "1.13.0", "h3-typebox": "0.6.0", "html-entities": "2.5.2", + "iconv": "3.0.1", "iconv-lite": "0.6.3", "js-yaml": "4.1.0", "jschardet": "3.1.4", diff --git a/src/t/t1.ts b/src/t/t1.ts new file mode 100644 index 00000000..6f45bacd --- /dev/null +++ b/src/t/t1.ts @@ -0,0 +1,54 @@ +import { readFile, readFileSync } from 'fs'; +import { detectEncoding, toUtf8, toUtf8i, toUtf8j } from '../utils/encoding'; + + +async function main() { + //p1(`${__dirname}/../../sample/sjis.html`); + p1(`${__dirname}/../../sample/_jis.html`); +} + +function p1(file: string) { + const content = readFileSync(file); + const enc = detectEncoding(content); + console.log('enc', enc); + + const count = 10000; + + /* + console.log(`iconv-lite`); + { + const t0 = performance.now(); + for (let i=0;i { + console.log('Done'); +}); diff --git a/src/utils/encoding.ts b/src/utils/encoding.ts index d6a2cef4..8fbaeca7 100644 --- a/src/utils/encoding.ts +++ b/src/utils/encoding.ts @@ -1,5 +1,8 @@ import * as iconv from 'iconv-lite'; +//var Iconv = require('iconv').Iconv; +import { Iconv } from 'iconv'; import * as jschardet from 'jschardet'; +const Encoding = require('encoding-japanese'); const regCharset = new RegExp(/charset\s*=\s*["']?([\w-]+)/, 'i'); @@ -32,7 +35,19 @@ export function toUtf8(body: Buffer, encoding: string): string { return iconv.decode(body, encoding); } +export function toUtf8i(body: Buffer, encoding: string): string { + const i = new Iconv(encoding, 'UTF-8'); + return i.convert(body).toString(); +} + +export function toUtf8j(body: Buffer, encoding: string): string { + return Encoding.codeToString(Encoding.convert(body, 'UNICODE', encoding)) +} + + function toEncoding(candicate: string): string | null { + if (candicate.toUpperCase() === 'ISO-2022-JP') return 'ISO-2022-JP'; + if (iconv.encodingExists(candicate)) { if (['shift_jis', 'shift-jis', 'windows-31j', 'x-sjis'].includes(candicate.toLowerCase())) return 'cp932'; return candicate; diff --git a/yarn.lock b/yarn.lock index b7dac2da..33dc43cd 100644 --- a/yarn.lock +++ b/yarn.lock @@ -331,6 +331,11 @@ domutils@^3.0.1, domutils@^3.1.0: domelementtype "^2.3.0" domhandler "^5.0.3" +encoding-japanese@2.2.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/encoding-japanese/-/encoding-japanese-2.2.0.tgz#0ef2d2351250547f432a2dd155453555c16deb59" + integrity sha512-EuJWwlHPZ1LbADuKTClvHtwbaFn4rOD+dRAbWysqEOXRc2Uui0hJInNJrsdH0c+OhJA4nrCBdSkW4DD5YxAo6A== + encoding-sniffer@^0.2.0: version "0.2.0" resolved "https://registry.yarnpkg.com/encoding-sniffer/-/encoding-sniffer-0.2.0.tgz#799569d66d443babe82af18c9f403498365ef1d5" @@ -496,6 +501,11 @@ iconv-lite@0.6.3, iconv-lite@^0.6.3: dependencies: safer-buffer ">= 2.1.2 < 3.0.0" +iconv@3.0.1: + version "3.0.1" + resolved "https://registry.yarnpkg.com/iconv/-/iconv-3.0.1.tgz#31d319d71b59415cf348362c382961b1eb6f0e81" + integrity sha512-lJnFLxVc0d82R7GfU7a9RujKVUQ3Eee19tPKWZWBJtAEGRHVEyFzCtbNl3GPKuDnHBBRT4/nDS4Ru9AIDT72qA== + inflight@^1.0.4: version "1.0.6" resolved "https://registry.yarnpkg.com/inflight/-/inflight-1.0.6.tgz#49bd6331d7d02d0c09bc910a1075ba8165b56df9"