-
-
Notifications
You must be signed in to change notification settings - Fork 326
/
Copy pathjapanese.ts
94 lines (77 loc) · 2.17 KB
/
japanese.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import type { DefaultTokenizer, DefaultTokenizerConfig } from "@orama/orama";
import { normalizeToken } from "@orama/orama/internals";
// Language identifier; used both as the tokenizer's `language` field and to
// narrow the `language` argument of tokenizeInternal.
const tokenizerLanguage = "japanese";
type TLanguage = typeof tokenizerLanguage;
// Orama's default tokenizer config with `language` pinned to "japanese".
type JapaneseTokenizerConfig = DefaultTokenizerConfig & {
  language: TLanguage;
};
// Config applied when createTokenizer is called without arguments.
const defaultConfig: JapaneseTokenizerConfig = {
  language: tokenizerLanguage,
};
// Word-granularity segmenter for Japanese, shared by every tokenize call.
const segmenter = new Intl.Segmenter("ja", { granularity: "word" });
/* c8 ignore next 10 */
/**
 * Strips empty-string entries from both ends of `text`.
 * Mutates the array in place and returns the same array instance.
 */
function trim(text: string[]): string[] {
  // Drop trailing "" entries.
  while (text.length > 0 && text[text.length - 1] === "") text.pop();
  // Drop leading "" entries.
  while (text.length > 0 && text[0] === "") text.shift();
  return text;
}
function tokenize(text: string): string[] {
const segments = segmenter.segment(text);
const tokens: string[] = [];
for (const segment of segments) {
if (segment.isWordLike) {
tokens.push(segment.segment);
}
}
return tokens;
}
/**
 * Tokenizer entry point, invoked with `this` bound to the tokenizer so it
 * can read `tokenizeSkipProperties`, `normalizeToken` and `allowDuplicates`.
 *
 * @param input - Text to tokenize; non-string values are passed through.
 * @param language - Accepted for interface compatibility; unused here.
 * @param prop - Schema property being indexed, if any.
 * @returns Trimmed tokens, deduplicated unless `allowDuplicates` is set.
 */
function tokenizeInternal(
  this: DefaultTokenizer,
  input: string,
  language?: TLanguage,
  prop?: string,
): string[] {
  /* c8 ignore next 3 */
  if (typeof input !== "string") {
    return [input];
  }
  // Properties listed in tokenizeSkipProperties are normalized as a single
  // token instead of being segmented.
  const skipSegmentation = prop && this?.tokenizeSkipProperties?.has(prop);
  let tokens: string[];
  if (skipSegmentation) {
    // @ts-ignore
    tokens = [this?.normalizeToken?.bind(this, prop ?? "")(input)];
  } else {
    tokens = tokenize(input);
  }
  const trimmed = trim(tokens);
  return this.allowDuplicates ? trimmed : Array.from(new Set(trimmed));
}
/**
 * Builds an Orama tokenizer for Japanese backed by Intl.Segmenter.
 *
 * @param config - Optional tokenizer options; defaults to `{ language: "japanese" }`.
 * @returns A DefaultTokenizer whose `tokenize` is bound to the returned object.
 */
export function createTokenizer(
  config: JapaneseTokenizerConfig = defaultConfig,
): DefaultTokenizer {
  const tokenizerConfig = {
    tokenize: tokenizeInternal,
    language: config.language,
    // Skip-property options accept a single value or an array; normalize to Sets.
    stemmerSkipProperties: new Set(
      config.stemmerSkipProperties ? [config.stemmerSkipProperties].flat() : [],
    ),
    tokenizeSkipProperties: new Set(
      config.tokenizeSkipProperties
        ? [config.tokenizeSkipProperties].flat()
        : [],
    ),
    stopWords: config.stopWords as string[] | undefined,
    allowDuplicates: Boolean(config.allowDuplicates),
    normalizeToken,
    normalizationCache: new Map(),
  };
  // Bind `this` to the tokenizer object itself: tokenizeInternal reads
  // this.tokenizeSkipProperties, this.normalizeToken and this.allowDuplicates.
  // (Previously this was bound to the function object, so those lookups all
  // yielded undefined — allowDuplicates and tokenizeSkipProperties were
  // silently ignored.)
  // @ts-ignore
  tokenizerConfig.tokenize = tokenizeInternal.bind(tokenizerConfig);
  return tokenizerConfig;
}