From 8fdd4bbffa539517e0578004157e8447c0fb9e90 Mon Sep 17 00:00:00 2001 From: Michele Riva Date: Thu, 27 Feb 2025 16:49:58 +0100 Subject: [PATCH] feat: improves chinese and japanese tokenizers (#899) --- packages/tokenizers/.tshy/build.json | 8 + packages/tokenizers/.tshy/commonjs.json | 16 + packages/tokenizers/.tshy/esm.json | 15 + packages/tokenizers/package.json | 50 +- packages/tokenizers/scripts/build.mjs | 68 -- packages/tokenizers/src/index.ts | 5 + packages/tokenizers/src/japanese.ts | 94 ++ packages/tokenizers/src/mandarin.ts | 94 ++ .../src/tokenizer-japanese/.gitignore | 2 - .../src/tokenizer-japanese/Cargo.lock | 826 ------------------ .../src/tokenizer-japanese/Cargo.toml | 14 - .../src/tokenizer-japanese/src/lib.rs | 38 - .../src/tokenizer-japanese/src/tokenizer.ts | 76 -- .../src/tokenizer-mandarin/.gitignore | 2 - .../src/tokenizer-mandarin/Cargo.lock | 329 ------- .../src/tokenizer-mandarin/Cargo.toml | 12 - .../src/tokenizer-mandarin/src/lib.rs | 11 - .../src/tokenizer-mandarin/src/tokenizer.ts | 76 -- .../{japanese.test.js => japanese.test.ts} | 21 +- .../{mandarin.test.js => mandarin.test.ts} | 16 +- packages/tokenizers/tsconfig.json | 18 + pnpm-lock.yaml | 8 +- 22 files changed, 302 insertions(+), 1497 deletions(-) create mode 100644 packages/tokenizers/.tshy/build.json create mode 100644 packages/tokenizers/.tshy/commonjs.json create mode 100644 packages/tokenizers/.tshy/esm.json delete mode 100644 packages/tokenizers/scripts/build.mjs create mode 100644 packages/tokenizers/src/index.ts create mode 100644 packages/tokenizers/src/japanese.ts create mode 100644 packages/tokenizers/src/mandarin.ts delete mode 100644 packages/tokenizers/src/tokenizer-japanese/.gitignore delete mode 100644 packages/tokenizers/src/tokenizer-japanese/Cargo.lock delete mode 100644 packages/tokenizers/src/tokenizer-japanese/Cargo.toml delete mode 100644 packages/tokenizers/src/tokenizer-japanese/src/lib.rs delete mode 100644 packages/tokenizers/src/tokenizer-japanese/src/tokenizer.ts delete mode 100644 packages/tokenizers/src/tokenizer-mandarin/.gitignore delete mode 100644 packages/tokenizers/src/tokenizer-mandarin/Cargo.lock delete mode 100644 packages/tokenizers/src/tokenizer-mandarin/Cargo.toml delete mode 100644 packages/tokenizers/src/tokenizer-mandarin/src/lib.rs delete mode 100644 packages/tokenizers/src/tokenizer-mandarin/src/tokenizer.ts rename packages/tokenizers/tests/{japanese.test.js => japanese.test.ts} (78%) rename packages/tokenizers/tests/{mandarin.test.js => mandarin.test.ts} (80%) create mode 100644 packages/tokenizers/tsconfig.json diff --git a/packages/tokenizers/.tshy/build.json b/packages/tokenizers/.tshy/build.json new file mode 100644 index 000000000..aea1a9e93 --- /dev/null +++ b/packages/tokenizers/.tshy/build.json @@ -0,0 +1,8 @@ +{ + "extends": "../tsconfig.json", + "compilerOptions": { + "rootDir": "../src", + "module": "nodenext", + "moduleResolution": "nodenext" + } +} diff --git a/packages/tokenizers/.tshy/commonjs.json b/packages/tokenizers/.tshy/commonjs.json new file mode 100644 index 000000000..7c9db50b6 --- /dev/null +++ b/packages/tokenizers/.tshy/commonjs.json @@ -0,0 +1,16 @@ +{ + "extends": "./build.json", + "include": [ + "../src/**/*.ts", + "../src/**/*.cts", + "../src/**/*.tsx", + "../src/**/*.json" + ], + "exclude": [ + "../src/**/*.mts", + "../src/package.json" + ], + "compilerOptions": { + "outDir": "../.tshy-build/commonjs" + } +} diff --git a/packages/tokenizers/.tshy/esm.json b/packages/tokenizers/.tshy/esm.json new file mode 100644 index 000000000..959294a84 --- /dev/null +++ b/packages/tokenizers/.tshy/esm.json @@ -0,0 +1,15 @@ +{ + "extends": "./build.json", + "include": [ + "../src/**/*.ts", + "../src/**/*.mts", + "../src/**/*.tsx", + "../src/**/*.json" + ], + "exclude": [ + "../src/package.json" + ], + "compilerOptions": { + "outDir": "../.tshy-build/esm" + } +} diff --git a/packages/tokenizers/package.json b/packages/tokenizers/package.json index 8ebe70379..64e3d20df 100644 --- a/packages/tokenizers/package.json +++ b/packages/tokenizers/package.json @@ -6,20 +6,33 @@ "sideEffects": false, "exports": { "./japanese": { - "types": "./build/tokenizer-japanese/tokenizer.d.ts", - "import": "./build/tokenizer-japanese/tokenizer.mjs", - "require": "./build/tokenizer-japanese/tokenizer.js" + "import": { + "types": "./dist/esm/japanese.d.ts", + "default": "./dist/esm/japanese.js" + }, + "require": { + "types": "./dist/commonjs/japanese.d.ts", + "default": "./dist/commonjs/japanese.js" + } }, "./mandarin": { - "types": "./build/tokenizer-mandarin/tokenizer.d.ts", - "import": "./build/tokenizer-mandarin/tokenizer.mjs", - "require": "./build/tokenizer-mandarin/tokenizer.js" - } + "import": { + "types": "./dist/esm/mandarin.d.ts", + "default": "./dist/esm/mandarin.js" + }, + "require": { + "types": "./dist/commonjs/mandarin.d.ts", + "default": "./dist/commonjs/mandarin.js" + } + }, + "./package.json": "./package.json" }, "dependencies": { "@orama/orama": "workspace:*" }, - "files": ["build"], + "files": [ + "build" + ], "repository": { "type": "git", "url": "https://github.com/oramasearch/orama" @@ -28,8 +41,8 @@ "url": "https://github.com/oramasearch/orama" }, "scripts": { - "build": "BUILD_TOKENIZERS=1 node ./scripts/build.mjs", - "test": "node ./tests/japanese.test.js && node ./tests/japanese.test.js" + "build": "tshy", + "test": "tsx ./tests/japanese.test.ts && tsx ./tests/mandarin.test.ts" }, "keywords": [ "full-text search", @@ -53,9 +66,20 @@ "engines": { "node": ">= 18.0.0" }, + "tshy": { + "dialects": [ + "esm", + "commonjs" + ], + "exports": { + "./japanese": "./src/japanese.ts", + "./mandarin": "./src/mandarin.ts", + "./package.json": "./package.json" + } + }, "devDependencies": { "tap": "^18.6.1", - "tsup": "^7.2.0", - "tsx": "^4.7.1" + "tshy": "^3.0.2", + "tsx": "^4.19.2" } -} +} \ No newline at end of file diff --git a/packages/tokenizers/scripts/build.mjs b/packages/tokenizers/scripts/build.mjs deleted file mode 100644 index f915d6156..000000000 --- a/packages/tokenizers/scripts/build.mjs +++ /dev/null @@ -1,68 +0,0 @@ -import path from 'node:path' -import fs from 'node:fs' -import childProcess from 'node:child_process' - -if (process.env.BUILD_TOKENIZERS !== '1') { - console.log('Skipping build for custom tokenizers.') - process.exit(0) -} - -const isWasmPackInstalled = await checkWasmPackInstalled() - -const languages = ['mandarin', 'japanese'] - -const outdirBaseURL = new URL('../build', import.meta.url).pathname - -if (fs.existsSync(outdirBaseURL)) { - fs.rmdirSync(outdirBaseURL, { recursive: true }) -} - -fs.mkdirSync(outdirBaseURL) - -for (const language of languages) { - if (!isWasmPackInstalled) { - console.warn('!! WARNING') - console.warn(`!! Compilation of the **${language}** tokenizer requires wasm-pack to be installed.`) - console.warn('!! No wasm-pack installation found. Skipping build.') - process.exit(0) - } - - const tokenizersBaseURL = new URL('../src', import.meta.url).pathname - - const tokenizerPath = path.join(tokenizersBaseURL, `tokenizer-${language}`) - const tokenizerWasmPath = path.join(tokenizerPath, 'pkg') - const tokenizerDistPath = path.join(tokenizersBaseURL, `../build/tokenizer-${language}`) - const tokenizerWrapperPath = path.join(tokenizersBaseURL, `tokenizer-${language}/src/tokenizer.ts`) - const tokenizerWrapperDistPath = path.join(tokenizerDistPath, 'tokenizer.ts') - - childProcess.execSync(`cd ${tokenizerPath} && wasm-pack build --target web`) - - fs.cpSync(tokenizerWrapperPath, tokenizerWrapperDistPath, { - recursive: true - }) - - fs.cpSync(tokenizerWasmPath, tokenizerDistPath, { - recursive: true - }) - - fs.rmSync(path.join(tokenizerDistPath, '.gitignore')) - - const r = fs.readFileSync(`./build/tokenizer-${language}/tokenizer_${language}_bg.wasm`) - const b = new Uint8Array(r) - const rr = `export const wasm = new Uint8Array([${b.join(',')}]);` - fs.writeFileSync(`./build/tokenizer-${language}/tokenizer_${language}_bg_wasm_arr.js`, rr) - - childProcess.execSync(`cd ${tokenizerDistPath} && npx tsup --format cjs,esm,iife --outDir . tokenizer.ts`) -} - -async function checkWasmPackInstalled() { - return new Promise((resolve) => { - childProcess.exec('wasm-pack --version', (err) => { - if (err) { - resolve(false) - } else { - resolve(true) - } - }) - }) -} diff --git a/packages/tokenizers/src/index.ts b/packages/tokenizers/src/index.ts new file mode 100644 index 000000000..d95373acf --- /dev/null +++ b/packages/tokenizers/src/index.ts @@ -0,0 +1,5 @@ +import { createTokenizer as createJapaneseTokenizer } from "./japanese.js"; + +export default { + japanese: createJapaneseTokenizer, +} \ No newline at end of file diff --git a/packages/tokenizers/src/japanese.ts b/packages/tokenizers/src/japanese.ts new file mode 100644 index 000000000..bdcb83ec8 --- /dev/null +++ b/packages/tokenizers/src/japanese.ts @@ -0,0 +1,94 @@ +import type { DefaultTokenizer, DefaultTokenizerConfig } from "@orama/orama"; +import { normalizeToken } from "@orama/orama/internals"; + +const tokenizerLanguage = "japanese"; + +type TLanguage = typeof tokenizerLanguage; + +type JapaneseTokenizerConfig = DefaultTokenizerConfig & { + language: TLanguage; +}; + +const defaultConfig: JapaneseTokenizerConfig = { + language: tokenizerLanguage, +}; + +const segmenter = new Intl.Segmenter("ja", { granularity: "word" }); + +/* c8 ignore next 10 */ +function trim(text: string[]): string[] { + while (text[text.length - 1] === "") { + text.pop(); + } + while (text[0] === "") { + text.shift(); + } + return text; +} + +function tokenize(text: string): string[] { + const segments = segmenter.segment(text); + + const tokens: string[] = []; + for (const segment of segments) { + if (segment.isWordLike) { + tokens.push(segment.segment); + } + } + + return tokens; +} + +function tokenizeInternal( + this: DefaultTokenizer, + input: string, + language?: TLanguage, + prop?: string, +): string[] { + /* c8 ignore next 3 */ + if (typeof input !== "string") { + return [input]; + } + + let tokens: string[]; + if (prop && this?.tokenizeSkipProperties?.has(prop)) { + // @ts-ignore + tokens = [this?.normalizeToken?.bind(this, prop ?? "")(input)]; + } else { + tokens = tokenize(input); + } + + const trimTokens = trim(tokens); + + if (!this.allowDuplicates) { + return Array.from(new Set(trimTokens)); + } + + return trimTokens; +} + +export function createTokenizer( + config: JapaneseTokenizerConfig = defaultConfig, +): DefaultTokenizer { + const tokenizerConfig = { + tokenize: tokenizeInternal, + language: config.language, + stemmerSkipProperties: new Set( + config.stemmerSkipProperties ? [config.stemmerSkipProperties].flat() : [], + ), + tokenizeSkipProperties: new Set( + config.tokenizeSkipProperties + ? [config.tokenizeSkipProperties].flat() + : [], + ), + stopWords: config.stopWords as string[] | undefined, + allowDuplicates: Boolean(config.allowDuplicates), + normalizeToken, + normalizationCache: new Map(), + }; + + // @ts-ignore + tokenizerConfig.tokenize = tokenizeInternal.bind(tokenizeInternal); + + return tokenizerConfig; +} diff --git a/packages/tokenizers/src/mandarin.ts b/packages/tokenizers/src/mandarin.ts new file mode 100644 index 000000000..c8e3aa8ca --- /dev/null +++ b/packages/tokenizers/src/mandarin.ts @@ -0,0 +1,94 @@ +import type { DefaultTokenizer, DefaultTokenizerConfig } from "@orama/orama"; +import { normalizeToken } from "@orama/orama/internals"; + +const tokenizerLanguage = "japanese"; + +type TLanguage = typeof tokenizerLanguage; + +type JapaneseTokenizerConfig = DefaultTokenizerConfig & { + language: TLanguage; +}; + +const defaultConfig: JapaneseTokenizerConfig = { + language: tokenizerLanguage, +}; + +const segmenter = new Intl.Segmenter("zh-CN", { granularity: "word" }); + +/* c8 ignore next 10 */ +function trim(text: string[]): string[] { + while (text[text.length - 1] === "") { + text.pop(); + } + while (text[0] === "") { + text.shift(); + } + return text; +} + +function tokenize(text: string): string[] { + const segments = segmenter.segment(text); + + const tokens: string[] = []; + for (const segment of segments) { + if (segment.isWordLike) { + tokens.push(segment.segment); + } + } + + return tokens; +} + +function tokenizeInternal( + this: DefaultTokenizer, + input: string, + language?: TLanguage, + prop?: string, +): string[] { + /* c8 ignore next 3 */ + if (typeof input !== "string") { + return [input]; + } + + let tokens: string[]; + if (prop && this?.tokenizeSkipProperties?.has(prop)) { + // @ts-ignore + tokens = [this?.normalizeToken?.bind(this, prop ?? "")(input)]; + } else { + tokens = tokenize(input); + } + + const trimTokens = trim(tokens); + + if (!this.allowDuplicates) { + return Array.from(new Set(trimTokens)); + } + + return trimTokens; +} + +export function createTokenizer( + config: JapaneseTokenizerConfig = defaultConfig, +): DefaultTokenizer { + const tokenizerConfig = { + tokenize: tokenizeInternal, + language: config.language, + stemmerSkipProperties: new Set( + config.stemmerSkipProperties ? [config.stemmerSkipProperties].flat() : [], + ), + tokenizeSkipProperties: new Set( + config.tokenizeSkipProperties + ? [config.tokenizeSkipProperties].flat() + : [], + ), + stopWords: config.stopWords as string[] | undefined, + allowDuplicates: Boolean(config.allowDuplicates), + normalizeToken, + normalizationCache: new Map(), + }; + + // @ts-ignore + tokenizerConfig.tokenize = tokenizeInternal.bind(tokenizeInternal); + + return tokenizerConfig; +} diff --git a/packages/tokenizers/src/tokenizer-japanese/.gitignore b/packages/tokenizers/src/tokenizer-japanese/.gitignore deleted file mode 100644 index 4dbfc37a1..000000000 --- a/packages/tokenizers/src/tokenizer-japanese/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -target -pkg diff --git a/packages/tokenizers/src/tokenizer-japanese/Cargo.lock b/packages/tokenizers/src/tokenizer-japanese/Cargo.lock deleted file mode 100644 index 84f3d7d00..000000000 --- a/packages/tokenizers/src/tokenizer-japanese/Cargo.lock +++ /dev/null @@ -1,826 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -version = 3 - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - -[[package]] -name = "aho-corasick" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" -dependencies = [ - "memchr", -] - -[[package]] -name = "anstream" -version = "0.6.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" -dependencies = [ - "anstyle", - "anstyle-parse", - "anstyle-query", - "anstyle-wincon", - "colorchoice", - "utf8parse", -] - -[[package]] -name = "anstyle" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" - -[[package]] -name = "anstyle-parse" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" -dependencies = [ - "utf8parse", -] - -[[package]] -name = "anstyle-query" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" -dependencies = [ - "windows-sys", -] - -[[package]] -name = "anstyle-wincon" -version = "3.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" -dependencies = [ - "anstyle", - "windows-sys", -] - -[[package]] -name = "anyhow" -version = "1.0.80" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" - -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" - -[[package]] -name = "bumpalo" -version = "3.15.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" - -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "colorchoice" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" - -[[package]] -name = "crc32fast" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "csv" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" -dependencies = [ - "csv-core", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" -dependencies = [ - "memchr", -] - -[[package]] -name = "encoding" -version = "0.2.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" -dependencies = [ - "encoding-index-japanese", - "encoding-index-korean", - "encoding-index-simpchinese", - "encoding-index-singlebyte", - "encoding-index-tradchinese", -] - -[[package]] -name = "encoding-index-japanese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-korean" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-simpchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-singlebyte" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-tradchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding_index_tests" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" - -[[package]] -name = "encoding_rs" -version = "0.8.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "encoding_rs_io" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" -dependencies = [ - "encoding_rs", -] - -[[package]] -name = "env_filter" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea" -dependencies = [ - "log", - "regex", -] - -[[package]] -name = "env_logger" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" -dependencies = [ - "anstream", - "anstyle", - "env_filter", - "humantime", - "log", -] - -[[package]] -name = "errno" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" -dependencies = [ - "libc", - "windows-sys", -] - -[[package]] -name = "filetime" -version = "0.2.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "windows-sys", -] - -[[package]] -name = "flate2" -version = "1.0.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" -dependencies = [ - "crc32fast", - "miniz_oxide", -] - -[[package]] -name = "glob" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" - -[[package]] -name = "humantime" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" - -[[package]] -name = "itoa" -version = "1.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" - -[[package]] -name = "libc" -version = "0.2.153" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" - -[[package]] -name = "lindera-cc-cedict-builder" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca21f2ee3ca40e7f3ebbd568d041be1531c2c28dbf540e737aeba934ab53f330" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "lindera-core" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09d4b717a8a31b73a3cbd3552e0abda14e0c85d97dc8b911035342533defdbad" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "encoding_rs", - "log", - "once_cell", - "serde", - "thiserror", - "yada", -] - -[[package]] -name = "lindera-decompress" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98f4476c99cb4ffa54fbfc42953adf69ada7276cfbb594bce9829547de012058" -dependencies = [ - "anyhow", - "flate2", - "serde", -] - -[[package]] -name = "lindera-dictionary" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a45b92f0ce331c2202c6cec3135e4bfce29525ab3bb97a613c27c8e0a29fa967" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "lindera-cc-cedict-builder", - "lindera-core", - "lindera-ipadic", - "lindera-ipadic-builder", - "lindera-ipadic-neologd-builder", - "lindera-ko-dic-builder", - "lindera-unidic-builder", - "serde", -] - -[[package]] -name = "lindera-ipadic" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ed621a83853d989a8ff806a4583d0f39e3fec3bf6e5905b1d1fc1a4f342a190" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-ipadic-builder", - "once_cell", - "tar", -] - -[[package]] -name = "lindera-ipadic-builder" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "642dee52201852df209cb43423ff1ca4d161a329f5cdba049a7b5820118345f2" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "serde", - "yada", -] - -[[package]] -name = "lindera-ipadic-neologd-builder" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "325144b154e68159373e944d1cd7f67c6ff9965a2af41240a8e41732b3fdb3af" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "serde", - "yada", -] - -[[package]] -name = "lindera-ko-dic-builder" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9413d4d9bf7af921f5ac64414a290c7ba81695e8ba08dd2f6c950b57c281a69" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "lindera-tokenizer" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9987c818462d51ca67e131e40f0386e25e8c557e195059b1257f95731561185d" -dependencies = [ - "bincode", - "lindera-core", - "lindera-dictionary", - "once_cell", - "serde", - "serde_json", -] - -[[package]] -name = "lindera-unidic-builder" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "601ec33b5174141396a7a4ca066278863840221fec32d0be19091e7fae91ed94" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "linux-raw-sys" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" - -[[package]] -name = "log" -version = "0.4.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" - -[[package]] -name = "memchr" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" - -[[package]] -name = "miniz_oxide" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" -dependencies = [ - "adler", -] - -[[package]] -name = "once_cell" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" - -[[package]] -name = "proc-macro2" -version = "1.0.78" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "quote" -version = "1.0.35" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "redox_syscall" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" -dependencies = [ - "bitflags 1.3.2", -] - -[[package]] -name = "regex" -version = "1.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" - -[[package]] -name = "rustix" -version = "0.38.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" -dependencies = [ - "bitflags 2.4.2", - "errno", - "libc", - "linux-raw-sys", - "windows-sys", -] - -[[package]] -name = "ryu" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" - -[[package]] -name = "serde" -version = "1.0.197" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.197" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.114" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" -dependencies = [ - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "syn" -version = "2.0.52" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "tar" -version = "0.4.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb" -dependencies = [ - "filetime", - "libc", - "xattr", -] - -[[package]] -name = "thiserror" -version = "1.0.57" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.57" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "tokenizer-japanese" -version = "0.1.0" -dependencies = [ - "lindera-core", - "lindera-dictionary", - "lindera-tokenizer", - "serde_json", - "wasm-bindgen", -] - -[[package]] -name = "unicode-ident" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" - -[[package]] -name = "utf8parse" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" - -[[package]] -name = "wasm-bindgen" -version = "0.2.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1e124130aee3fb58c5bdd6b639a0509486b0338acaaae0c84a5124b0f588b7f" -dependencies = [ - "cfg-if", - "serde", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9e7e1900c352b609c8488ad12639a311045f40a35491fb69ba8c12f758af70b" -dependencies = [ - "bumpalo", - "log", - "once_cell", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b30af9e2d358182b5c7449424f017eba305ed32a7010509ede96cdc4696c46ed" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "642f325be6301eb8107a83d12a8ac6c1e1c54345a7ef1a9261962dfefda09e66" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f186bd2dcf04330886ce82d6f33dd75a7bfcf69ecf5763b89fcde53b6ac9838" - -[[package]] -name = "windows-sys" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" -dependencies = [ - "windows-targets", -] - -[[package]] -name = "windows-targets" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" -dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" - -[[package]] -name = "windows_i686_gnu" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" - -[[package]] -name = "windows_i686_msvc" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" - -[[package]] -name = "xattr" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f" -dependencies = [ - "libc", - "linux-raw-sys", - "rustix", -] - -[[package]] -name = "yada" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" diff --git a/packages/tokenizers/src/tokenizer-japanese/Cargo.toml b/packages/tokenizers/src/tokenizer-japanese/Cargo.toml deleted file mode 100644 index e81ede927..000000000 --- a/packages/tokenizers/src/tokenizer-japanese/Cargo.toml +++ /dev/null @@ -1,14 +0,0 @@ -[package] -name = "tokenizer-japanese" -version = "0.1.0" -edition = "2021" - -[dependencies] -wasm-bindgen = { version = "0.2.91", features = ["serde"] } -serde_json = "1.0.113" -lindera-tokenizer = { version = "0.28.0", features = ["ipadic"] } -lindera-core = "0.28.0" -lindera-dictionary = "0.28.0" - -[lib] -crate-type = ["cdylib", "rlib"] diff --git a/packages/tokenizers/src/tokenizer-japanese/src/lib.rs b/packages/tokenizers/src/tokenizer-japanese/src/lib.rs deleted file mode 100644 index 89073ea30..000000000 --- a/packages/tokenizers/src/tokenizer-japanese/src/lib.rs +++ /dev/null @@ -1,38 +0,0 @@ -use lindera_core::{mode::Mode}; -use lindera_dictionary::{DictionaryConfig, DictionaryKind}; -use lindera_tokenizer::tokenizer::{Tokenizer, TokenizerConfig}; -use wasm_bindgen::prelude::*; - -#[wasm_bindgen] -pub fn tokenize(text: &str) -> JsValue { - let dictionary = DictionaryConfig { - kind: Some(DictionaryKind::IPADIC), - path: None, - }; - - let config = TokenizerConfig { - dictionary, - user_dictionary: None, - mode: Mode::Normal, - }; - - let tokenizer = match Tokenizer::from_config(config) { - Ok(t) => t, - Err(e) => { - let error_message = format!("Failed to create tokenizer: {}", e); - wasm_bindgen::throw_val(JsValue::from_str(&error_message)) - } - }; - - let tokens = match tokenizer.tokenize(text) { - Ok(t) => t, - Err(e) => { - let error_message = format!("Failed to create tokenizer: {}", e); - wasm_bindgen::throw_val(JsValue::from_str(&error_message)) - } - }; - - let tokens_str = serde_json::to_string(&tokens).unwrap_or_else(|_| String::from("[]")); - - JsValue::from_str(&tokens_str) -} diff --git a/packages/tokenizers/src/tokenizer-japanese/src/tokenizer.ts b/packages/tokenizers/src/tokenizer-japanese/src/tokenizer.ts deleted file mode 100644 index 2cf8bb46c..000000000 --- a/packages/tokenizers/src/tokenizer-japanese/src/tokenizer.ts +++ /dev/null @@ -1,76 +0,0 @@ -import type { DefaultTokenizerConfig, DefaultTokenizer } from '@orama/orama' -import { normalizeToken } from '@orama/orama/internals' -// @ts-expect-error - this file is gonna be moved inside the `pkg` folder at build time -import init, { tokenize } from './tokenizer_japanese.js' -// @ts-expect-error - this file is gonna be created at build time -import { wasm } from './tokenizer_japanese_bg_wasm_arr.js' - -const tokenizerLanguage = 'japanese' - -type TLanguage = typeof tokenizerLanguage - -type JapaneseTokenizerConfig = DefaultTokenizerConfig & { - language: TLanguage -} - -const defaultConfig: JapaneseTokenizerConfig = { - language: tokenizerLanguage -} - -/* c8 ignore next 10 */ -function trim(text: string[]): string[] { - while (text[text.length - 1] === '') { - text.pop() - } - while (text[0] === '') { - text.shift() - } - return text -} - -async function tokenizeInternal( - this: DefaultTokenizer, - input: string, - language?: TLanguage, - prop?: string -): Promise { - /* c8 ignore next 3 */ - if (typeof input !== 'string') { - return [input] - } - - let tokens: string[] - if (prop && this?.tokenizeSkipProperties?.has(prop)) { - tokens = [this?.normalizeToken?.bind(this, prop ?? '')(input)] - } else { - tokens = await tokenize(input, true) - } - - const trimTokens = trim(tokens) - - if (!this.allowDuplicates) { - return Array.from(new Set(trimTokens)) - } - - return trimTokens -} - -export async function createTokenizer(config: JapaneseTokenizerConfig = defaultConfig): Promise { - await init(wasm) - - const tokenizerConfig = { - tokenize: tokenizeInternal, - language: config.language, - stemmerSkipProperties: new Set(config.stemmerSkipProperties ? [config.stemmerSkipProperties].flat() : []), - tokenizeSkipProperties: new Set(config.tokenizeSkipProperties ? [config.tokenizeSkipProperties].flat() : []), - stopWords: config.stopWords as string[] | undefined, - allowDuplicates: Boolean(config.allowDuplicates), - normalizeToken, - normalizationCache: new Map() - } - - tokenizerConfig.tokenize = tokenizeInternal.bind(tokenizeInternal) - - // @ts-expect-error - here we are forcing "japanese" as a language - return tokenizerConfig -} diff --git a/packages/tokenizers/src/tokenizer-mandarin/.gitignore b/packages/tokenizers/src/tokenizer-mandarin/.gitignore deleted file mode 100644 index 4dbfc37a1..000000000 --- a/packages/tokenizers/src/tokenizer-mandarin/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -target -pkg diff --git a/packages/tokenizers/src/tokenizer-mandarin/Cargo.lock b/packages/tokenizers/src/tokenizer-mandarin/Cargo.lock deleted file mode 100644 index e1d5f7914..000000000 --- a/packages/tokenizers/src/tokenizer-mandarin/Cargo.lock +++ /dev/null @@ -1,329 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -version = 3 - -[[package]] -name = "aho-corasick" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" -dependencies = [ - "memchr", -] - -[[package]] -name = "bumpalo" -version = "3.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" - -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - -[[package]] -name = "cedarwood" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" -dependencies = [ - "smallvec", -] - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - -[[package]] -name = "hashbrown" -version = "0.14.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" - -[[package]] -name = "itoa" -version = "1.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" - -[[package]] -name = "jieba-rs" -version = "0.6.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f0c1347cd3ac8d7c6e3a2dc33ac496d365cf09fc0831aa61111e1a6738983e" -dependencies = [ - "cedarwood", - "fxhash", - "hashbrown", - "lazy_static", - "phf", - "phf_codegen", - "regex", -] - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "log" -version = "0.4.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" - -[[package]] -name = "memchr" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" - -[[package]] -name = "once_cell" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" - -[[package]] -name = "phf" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_codegen" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" -dependencies = [ - "phf_generator", - "phf_shared", -] - -[[package]] -name = "phf_generator" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" -dependencies = [ - "phf_shared", - "rand", -] - -[[package]] -name = "phf_shared" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" -dependencies = [ - "siphasher", -] - -[[package]] -name = "proc-macro2" -version = "1.0.78" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "quote" -version = "1.0.35" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" - -[[package]] -name = "regex" -version = "1.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" - -[[package]] -name = "ryu" -version = "1.0.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" - -[[package]] -name = "serde" -version = "1.0.196" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "870026e60fa08c69f064aa766c10f10b1d62db9ccd4d0abb206472bee0ce3b32" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.196" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33c85360c95e7d137454dc81d9a4ed2b8efd8fbe19cee57357b32b9771fccb67" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.113" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69801b70b1c3dac963ecb03a364ba0ceda9cf60c71cfe475e99864759c8b8a79" -dependencies = [ - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "siphasher" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" - -[[package]] -name = "smallvec" -version = "1.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" - -[[package]] -name = "syn" -version = "2.0.48" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "tokenizer-mandarin" -version = "0.1.0" -dependencies = [ - "jieba-rs", - "serde_json", - "wasm-bindgen", -] - -[[package]] -name = "unicode-ident" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" - -[[package]] -name = "wasm-bindgen" -version = "0.2.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1e124130aee3fb58c5bdd6b639a0509486b0338acaaae0c84a5124b0f588b7f" -dependencies = [ - "cfg-if", - "serde", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9e7e1900c352b609c8488ad12639a311045f40a35491fb69ba8c12f758af70b" -dependencies = [ - "bumpalo", - "log", - "once_cell", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b30af9e2d358182b5c7449424f017eba305ed32a7010509ede96cdc4696c46ed" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "642f325be6301eb8107a83d12a8ac6c1e1c54345a7ef1a9261962dfefda09e66" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f186bd2dcf04330886ce82d6f33dd75a7bfcf69ecf5763b89fcde53b6ac9838" diff --git a/packages/tokenizers/src/tokenizer-mandarin/Cargo.toml b/packages/tokenizers/src/tokenizer-mandarin/Cargo.toml deleted file mode 100644 index 7ab042a0f..000000000 --- a/packages/tokenizers/src/tokenizer-mandarin/Cargo.toml +++ /dev/null @@ -1,12 +0,0 @@ -[package] -name = "tokenizer-mandarin" -version = "0.1.0" -edition = "2021" - -[dependencies] -wasm-bindgen = { version = "0.2.91", features = ["serde"] } -serde_json = "1.0.113" -jieba-rs = "0.6.8" - -[lib] -crate-type = ["cdylib", "rlib"] diff --git a/packages/tokenizers/src/tokenizer-mandarin/src/lib.rs b/packages/tokenizers/src/tokenizer-mandarin/src/lib.rs deleted file mode 100644 index 4317e4510..000000000 --- a/packages/tokenizers/src/tokenizer-mandarin/src/lib.rs +++ /dev/null @@ -1,11 +0,0 @@ -use jieba_rs::Jieba; -use serde_json::to_string; -use wasm_bindgen::prelude::*; - -#[wasm_bindgen] -pub fn tokenize(sentence: &str, hmm: bool) -> JsValue { - let jieba = Jieba::new(); - let words = jieba.cut(sentence, hmm); - let serialized_words = to_string(&words).unwrap_or_default(); - JsValue::from_str(&serialized_words) -} diff --git a/packages/tokenizers/src/tokenizer-mandarin/src/tokenizer.ts b/packages/tokenizers/src/tokenizer-mandarin/src/tokenizer.ts deleted file mode 100644 index 7934bc3e0..000000000 --- a/packages/tokenizers/src/tokenizer-mandarin/src/tokenizer.ts +++ /dev/null @@ -1,76 +0,0 @@ -import type { DefaultTokenizerConfig, DefaultTokenizer } from '@orama/orama' -import { normalizeToken } from '@orama/orama/internals' -// @ts-expect-error - this file is gonna be moved inside the `pkg` folder at build time -import init, { tokenize } from './tokenizer_mandarin.js' -// @ts-expect-error - this file is gonna be created at build time -import { wasm } from './tokenizer_mandarin_bg_wasm_arr.js' - -const tokenizerLanguage = 'mandarin' - -type TLanguage = typeof tokenizerLanguage - -type MandarinTokenizerConfig = DefaultTokenizerConfig & { - language: TLanguage -} - -const defaultConfig: MandarinTokenizerConfig = { - language: tokenizerLanguage -} - -/* c8 ignore next 10 */ -function trim(text: string[]): string[] { - while (text[text.length - 1] === '') { - text.pop() - } - while (text[0] === '') { - text.shift() - } - return text -} - -async function tokenizeInternal( - this: DefaultTokenizer, - input: string, - language?: TLanguage, - prop?: string -): Promise { - /* c8 ignore next 3 */ - if (typeof input !== 'string') { - return [input] - } - - let tokens: string[] - if (prop && this?.tokenizeSkipProperties?.has(prop)) { - tokens = [this?.normalizeToken?.bind(this, prop ?? '')(input)] - } else { - tokens = await tokenize(input, true) - } - - const trimTokens = trim(tokens) - - if (!this.allowDuplicates) { - return Array.from(new Set(trimTokens)) - } - - return trimTokens -} - -export async function createTokenizer(config: MandarinTokenizerConfig = defaultConfig): Promise { - await init(wasm) - - const tokenizerConfig = { - tokenize: tokenizeInternal, - language: config.language, - stemmerSkipProperties: new Set(config.stemmerSkipProperties ? [config.stemmerSkipProperties].flat() : []), - tokenizeSkipProperties: new Set(config.tokenizeSkipProperties ? [config.tokenizeSkipProperties].flat() : []), - stopWords: config.stopWords as string[] | undefined, - allowDuplicates: Boolean(config.allowDuplicates), - normalizeToken, - normalizationCache: new Map() - } - - tokenizerConfig.tokenize = tokenizeInternal.bind(tokenizeInternal) - - // @ts-expect-error - here we are forcing "mandarin" as a language - return tokenizerConfig -} diff --git a/packages/tokenizers/tests/japanese.test.js b/packages/tokenizers/tests/japanese.test.ts similarity index 78% rename from packages/tokenizers/tests/japanese.test.js rename to packages/tokenizers/tests/japanese.test.ts index 23bb20baa..559bdb3f4 100644 --- a/packages/tokenizers/tests/japanese.test.js +++ b/packages/tokenizers/tests/japanese.test.ts @@ -1,30 +1,23 @@ -import fs from 'fs' import t from 'tap' -import { create, insert, search } from '@orama/orama' +import { create, insert, Results, search } from '@orama/orama' +import { createTokenizer } from '../src/japanese.js' -if (!fs.existsSync('build/tokenizer-japanese/tokenizer.js') && process.env.TEST_TOKENIZERS !== '1') { - // Still experimental. @todo: remove this check - console.log(`Skipping Japanese tokenizer tests`) - process.exit(0) -} - -const { createTokenizer } = await import('../build/tokenizer-japanese/tokenizer.js') - -const db = await create({ +const db = create({ schema: { name: 'string' }, components: { - tokenizer: await createTokenizer() + tokenizer: createTokenizer() } }) +// @ts-ignore function getHitsNames(hits) { + // @ts-ignore return hits.map((hit) => hit.document.name) } -// Temporary skip. Initializing a WASM package in CI is slowing down everything and we need to find a better way to handle this. -t.skip('Japanese tokenizer', async (t) => { +t.test('Japanese tokenizer', async (t) => { await insert(db, { name: '東京' }) // Tokyo await insert(db, { name: '大阪' }) // Osaka await insert(db, { name: '京都' }) // Kyoto diff --git a/packages/tokenizers/tests/mandarin.test.js b/packages/tokenizers/tests/mandarin.test.ts similarity index 80% rename from packages/tokenizers/tests/mandarin.test.js rename to packages/tokenizers/tests/mandarin.test.ts index f35afe282..79ae6300c 100644 --- a/packages/tokenizers/tests/mandarin.test.js +++ b/packages/tokenizers/tests/mandarin.test.ts @@ -1,21 +1,14 @@ import fs from 'fs' import t from 'tap' import { create, insert, search } from '@orama/orama' +import { createTokenizer } from '../src/mandarin.js' -if (!fs.existsSync('build/tokenizer-mandarin/tokenizer.js') && process.env.TEST_TOKENIZERS !== '1') { - // Still experimental. @todo: remove this check - console.log(`Skipping Mandarin tokenizer tests`) - process.exit(0) -} - -const { createTokenizer } = await import('../build/tokenizer-mandarin/tokenizer.js') - -const db = await create({ +const db = create({ schema: { name: 'string' }, components: { - tokenizer: await createTokenizer() + tokenizer: createTokenizer() } }) @@ -23,8 +16,7 @@ function getHitsNames(hits) { return hits.map((hit) => hit.document.name) } -// Temporary skip. Initializing a WASM package in CI is slowing down everything and we need to find a better way to handle this. -t.skip('Mandarin tokenizer', async (t) => { +t.test('Mandarin tokenizer', async (t) => { await insert(db, { name: '北京' }) // Beijing await insert(db, { name: '上海' }) // Shanghai await insert(db, { name: '广州' }) // Guangzhou diff --git a/packages/tokenizers/tsconfig.json b/packages/tokenizers/tsconfig.json new file mode 100644 index 000000000..648a72cca --- /dev/null +++ b/packages/tokenizers/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "declaration": true, + "declarationMap": true, + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "inlineSources": true, + "jsx": "react", + "module": "nodenext", + "moduleResolution": "nodenext", + "noUncheckedIndexedAccess": true, + "resolveJsonModule": true, + "skipLibCheck": true, + "sourceMap": true, + "strict": false, + "target": "es2022" + } +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0e6c169d5..2f2ce2a0c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -834,11 +834,11 @@ importers: tap: specifier: ^18.6.1 version: 18.8.0(@swc/core@1.10.9)(@types/node@20.17.14)(react-dom@18.3.1)(react@18.3.1)(typescript@5.7.3) - tsup: - specifier: ^7.2.0 - version: 7.2.0(@swc/core@1.10.9)(typescript@5.7.3) + tshy: + specifier: ^3.0.2 + version: 3.0.2 tsx: - specifier: ^4.7.1 + specifier: ^4.19.2 version: 4.19.2 sandboxes/plugin-docusaurus-v2.4.3-sandbox: