feat: improves chinese and japanese tokenizers (#899)
micheleriva authored Feb 27, 2025
1 parent 22cf6a2 commit 8fdd4bb
Showing 22 changed files with 302 additions and 1,497 deletions.
8 changes: 8 additions & 0 deletions packages/tokenizers/.tshy/build.json
@@ -0,0 +1,8 @@
{
"extends": "../tsconfig.json",
"compilerOptions": {
"rootDir": "../src",
"module": "nodenext",
"moduleResolution": "nodenext"
}
}
16 changes: 16 additions & 0 deletions packages/tokenizers/.tshy/commonjs.json
@@ -0,0 +1,16 @@
{
"extends": "./build.json",
"include": [
"../src/**/*.ts",
"../src/**/*.cts",
"../src/**/*.tsx",
"../src/**/*.json"
],
"exclude": [
"../src/**/*.mts",
"../src/package.json"
],
"compilerOptions": {
"outDir": "../.tshy-build/commonjs"
}
}
15 changes: 15 additions & 0 deletions packages/tokenizers/.tshy/esm.json
@@ -0,0 +1,15 @@
{
"extends": "./build.json",
"include": [
"../src/**/*.ts",
"../src/**/*.mts",
"../src/**/*.tsx",
"../src/**/*.json"
],
"exclude": [
"../src/package.json"
],
"compilerOptions": {
"outDir": "../.tshy-build/esm"
}
}
50 changes: 37 additions & 13 deletions packages/tokenizers/package.json
@@ -6,20 +6,33 @@
"sideEffects": false,
"exports": {
"./japanese": {
"types": "./build/tokenizer-japanese/tokenizer.d.ts",
"import": "./build/tokenizer-japanese/tokenizer.mjs",
"require": "./build/tokenizer-japanese/tokenizer.js"
"import": {
"types": "./dist/esm/japanese.d.ts",
"default": "./dist/esm/japanese.js"
},
"require": {
"types": "./dist/commonjs/japanese.d.ts",
"default": "./dist/commonjs/japanese.js"
}
},
"./mandarin": {
"types": "./build/tokenizer-mandarin/tokenizer.d.ts",
"import": "./build/tokenizer-mandarin/tokenizer.mjs",
"require": "./build/tokenizer-mandarin/tokenizer.js"
}
"import": {
"types": "./dist/esm/mandarin.d.ts",
"default": "./dist/esm/mandarin.js"
},
"require": {
"types": "./dist/commonjs/mandarin.d.ts",
"default": "./dist/commonjs/mandarin.js"
}
},
"./package.json": "./package.json"
},
"dependencies": {
"@orama/orama": "workspace:*"
},
"files": ["build"],
"files": [
"build"
],
"repository": {
"type": "git",
"url": "https://github.com/oramasearch/orama"
@@ -28,8 +28,8 @@
"url": "https://github.com/oramasearch/orama"
},
"scripts": {
"build": "BUILD_TOKENIZERS=1 node ./scripts/build.mjs",
"test": "node ./tests/japanese.test.js && node ./tests/japanese.test.js"
"build": "tshy",
"test": "tsx ./tests/japanese.test.ts && tsx ./tests/mandarin.test.ts"
},
"keywords": [
"full-text search",
@@ -53,9 +66,20 @@
"engines": {
"node": ">= 18.0.0"
},
"tshy": {
"dialects": [
"esm",
"commonjs"
],
"exports": {
"./japanese": "./src/japanese.ts",
"./mandarin": "./src/mandarin.ts",
"./package.json": "./package.json"
}
},
"devDependencies": {
"tap": "^18.6.1",
"tsup": "^7.2.0",
"tsx": "^4.7.1"
"tshy": "^3.0.2",
"tsx": "^4.19.2"
}
}
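
For context, the reshaped "exports" map above serves a different build per module system. A minimal consumer sketch, assuming the package is published as "@orama/tokenizers" (the "name" field is not shown in this diff):

// Hypothetical consumer; the package name "@orama/tokenizers" is an assumption.
import { createTokenizer } from "@orama/tokenizers/japanese";

// Under ESM, Node resolves the "import" condition to ./dist/esm/japanese.js;
// a CommonJS require() resolves to ./dist/commonjs/japanese.js instead.
const tokenizer = createTokenizer();
console.log(tokenizer.language); // "japanese"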
68 changes: 0 additions & 68 deletions packages/tokenizers/scripts/build.mjs

This file was deleted.

5 changes: 5 additions & 0 deletions packages/tokenizers/src/index.ts
@@ -0,0 +1,5 @@
import { createTokenizer as createJapaneseTokenizer } from "./japanese.js";

export default {
japanese: createJapaneseTokenizer,
}
94 changes: 94 additions & 0 deletions packages/tokenizers/src/japanese.ts
@@ -0,0 +1,94 @@
import type { DefaultTokenizer, DefaultTokenizerConfig } from "@orama/orama";
import { normalizeToken } from "@orama/orama/internals";

const tokenizerLanguage = "japanese";

type TLanguage = typeof tokenizerLanguage;

type JapaneseTokenizerConfig = DefaultTokenizerConfig & {
language: TLanguage;
};

const defaultConfig: JapaneseTokenizerConfig = {
language: tokenizerLanguage,
};

const segmenter = new Intl.Segmenter("ja", { granularity: "word" });

/* c8 ignore next 10 */
function trim(text: string[]): string[] {
while (text[text.length - 1] === "") {
text.pop();
}
while (text[0] === "") {
text.shift();
}
return text;
}

function tokenize(text: string): string[] {
const segments = segmenter.segment(text);

const tokens: string[] = [];
for (const segment of segments) {
if (segment.isWordLike) {
tokens.push(segment.segment);
}
}

return tokens;
}

function tokenizeInternal(
this: DefaultTokenizer,
input: string,
language?: TLanguage,
prop?: string,
): string[] {
/* c8 ignore next 3 */
if (typeof input !== "string") {
return [input];
}

let tokens: string[];
if (prop && this?.tokenizeSkipProperties?.has(prop)) {
// @ts-ignore
tokens = [this?.normalizeToken?.bind(this, prop ?? "")(input)];
} else {
tokens = tokenize(input);
}

const trimTokens = trim(tokens);

if (!this.allowDuplicates) {
return Array.from(new Set(trimTokens));
}

return trimTokens;
}

export function createTokenizer(
config: JapaneseTokenizerConfig = defaultConfig,
): DefaultTokenizer {
const tokenizerConfig = {
tokenize: tokenizeInternal,
language: config.language,
stemmerSkipProperties: new Set(
config.stemmerSkipProperties ? [config.stemmerSkipProperties].flat() : [],
),
tokenizeSkipProperties: new Set(
config.tokenizeSkipProperties
? [config.tokenizeSkipProperties].flat()
: [],
),
stopWords: config.stopWords as string[] | undefined,
allowDuplicates: Boolean(config.allowDuplicates),
normalizeToken,
normalizationCache: new Map(),
};

// @ts-ignore - bind the config object so `this` resolves inside tokenizeInternal
tokenizerConfig.tokenize = tokenizeInternal.bind(tokenizerConfig);

return tokenizerConfig;
}
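
For reference, a self-contained sketch of the Intl.Segmenter behavior that tokenize() relies on. Exact segment boundaries are engine-dependent, since segmentation is delegated to the runtime's ICU data:

const demoSegmenter = new Intl.Segmenter("ja", { granularity: "word" });
const demoTokens: string[] = [];
for (const s of demoSegmenter.segment("これはテストです")) {
  // Punctuation and whitespace segments report isWordLike === false,
  // so only word-like segments survive as tokens.
  if (s.isWordLike) {
    demoTokens.push(s.segment);
  }
}
console.log(demoTokens); // e.g. ["これ", "は", "テスト", "です"]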
94 changes: 94 additions & 0 deletions packages/tokenizers/src/mandarin.ts
@@ -0,0 +1,94 @@
import type { DefaultTokenizer, DefaultTokenizerConfig } from "@orama/orama";
import { normalizeToken } from "@orama/orama/internals";

const tokenizerLanguage = "mandarin";

type TLanguage = typeof tokenizerLanguage;

type MandarinTokenizerConfig = DefaultTokenizerConfig & {
language: TLanguage;
};

const defaultConfig: MandarinTokenizerConfig = {
language: tokenizerLanguage,
};

const segmenter = new Intl.Segmenter("zh-CN", { granularity: "word" });

/* c8 ignore next 10 */
function trim(text: string[]): string[] {
while (text[text.length - 1] === "") {
text.pop();
}
while (text[0] === "") {
text.shift();
}
return text;
}

function tokenize(text: string): string[] {
const segments = segmenter.segment(text);

const tokens: string[] = [];
for (const segment of segments) {
if (segment.isWordLike) {
tokens.push(segment.segment);
}
}

return tokens;
}

function tokenizeInternal(
this: DefaultTokenizer,
input: string,
language?: TLanguage,
prop?: string,
): string[] {
/* c8 ignore next 3 */
if (typeof input !== "string") {
return [input];
}

let tokens: string[];
if (prop && this?.tokenizeSkipProperties?.has(prop)) {
// @ts-ignore
tokens = [this?.normalizeToken?.bind(this, prop ?? "")(input)];
} else {
tokens = tokenize(input);
}

const trimTokens = trim(tokens);

if (!this.allowDuplicates) {
return Array.from(new Set(trimTokens));
}

return trimTokens;
}

export function createTokenizer(
config: MandarinTokenizerConfig = defaultConfig,
): DefaultTokenizer {
const tokenizerConfig = {
tokenize: tokenizeInternal,
language: config.language,
stemmerSkipProperties: new Set(
config.stemmerSkipProperties ? [config.stemmerSkipProperties].flat() : [],
),
tokenizeSkipProperties: new Set(
config.tokenizeSkipProperties
? [config.tokenizeSkipProperties].flat()
: [],
),
stopWords: config.stopWords as string[] | undefined,
allowDuplicates: Boolean(config.allowDuplicates),
normalizeToken,
normalizationCache: new Map(),
};

// @ts-ignore - bind the config object so `this` resolves inside tokenizeInternal
tokenizerConfig.tokenize = tokenizeInternal.bind(tokenizerConfig);

return tokenizerConfig;
}
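
An end-to-end sketch of wiring this tokenizer into an Orama instance. This assumes the synchronous Orama v3 API, the "@orama/tokenizers" package name, and that custom tokenizers plug in via components.tokenizer; none of these are shown in this diff:

import { create, insert, search } from "@orama/orama";
// The package name below is assumed, not taken from this diff.
import { createTokenizer } from "@orama/tokenizers/mandarin";

const db = create({
  schema: { title: "string" },
  components: {
    tokenizer: createTokenizer(),
  },
});

insert(db, { title: "世界上最快的搜索引擎" });
const results = search(db, { term: "搜索" });
console.log(results.hits.length);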
2 changes: 0 additions & 2 deletions packages/tokenizers/src/tokenizer-japanese/.gitignore

This file was deleted.
