Skip to content

Commit

Permalink
Update usages of dict.create() to dict.tokenizer()
Browse files Browse the repository at this point in the history
  • Loading branch information
mh-northlander committed Nov 20, 2024
1 parent 4f66095 commit a349c54
Show file tree
Hide file tree
Showing 12 changed files with 29 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ public static void main(String[] args) throws IOException {
try (PrintStream output = outputFileName == null ? new FileOrStdoutPrintStream()
: new FileOrStdoutPrintStream(outputFileName);
Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
if (isEnableDump) {
tokenizer.setDumpOutput(output);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public void tearDown() throws IOException {

@Test
public void create() {
assertThat(dict.create(), isA(Tokenizer.class));
assertThat(dict.tokenizer(), isA(Tokenizer.class));
}

@Test
Expand All @@ -64,7 +64,7 @@ public void instantiateConfigWithoutCharDef() throws IOException {
cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) {
assertThat(jd, notNullValue());
assertThat(jd.create(), notNullValue());
assertThat(jd.tokenizer(), notNullValue());
}
}

Expand All @@ -79,14 +79,14 @@ private JapaneseDictionary makeDictionaryIncorrectly() throws IOException {
@Test(expected = IllegalStateException.class)
public void throwExceptionOnDictionaryUsageAfterClose() throws IOException {
JapaneseDictionary dic = makeDictionaryIncorrectly();
Tokenizer ignored = dic.create();
Tokenizer ignored = dic.tokenizer();
}

private Tokenizer makeTokenizerIncorrectly() throws IOException {
Config cfg = Config.fromClasspath("sudachi_minimum.json");
cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) {
return jd.create();
return jd.tokenizer();
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class JapaneseTokenizerMaskTest {
cfg0.addOovProviderPlugin(SimpleOovProviderPlugin::class.java)
val cfg = cfg0.withFallback(TestDictionary.user0Cfg())
val dic = DictionaryFactory().create(cfg) as JapaneseDictionary
val tokenizer = dic.create()
val tokenizer = dic.tokenizer()

assertEquals(2, dic.oovProviderPlugins.size)
assertIs<CaptureOtherWords>(dic.oovProviderPlugins[0])
Expand All @@ -62,7 +62,7 @@ class JapaneseTokenizerMaskTest {
val cfg = TestDictionary.user0Cfg()
cfg.addOovProviderPlugin(CaptureOtherWords::class.java)
val dic = DictionaryFactory().create(cfg) as JapaneseDictionary
val tokenizer = dic.create()
val tokenizer = dic.tokenizer()

assertIs<SimpleOovProviderPlugin>(dic.oovProviderPlugins[0])
assertIs<CaptureOtherWords>(dic.oovProviderPlugins[1])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import kotlin.test.assertEquals
import kotlin.test.assertFailsWith

class JapaneseTokenizerStreamingTest {
private val tokenizer = TestDictionary.user0().create()
private val tokenizer = TestDictionary.user0().tokenizer()

class BadReader(private val data: String, private val window: Int = 512) : Reader() {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public class JapaneseTokenizerTest {
@Before
public void setUp() {
dict = TestDictionary.INSTANCE.user1();
tokenizer = (JapaneseTokenizer) dict.create();
tokenizer = (JapaneseTokenizer) dict.tokenizer();
}

private static Matcher<Morpheme> morpheme(String surface, int begin, int end) {
Expand Down Expand Up @@ -353,7 +353,7 @@ public void zeroLengthMorpheme() {
public void disableEmptyMorpheme() throws IOException {
Config config = TestDictionary.INSTANCE.user1Cfg();
dict = new DictionaryFactory().create(Config.empty().withFallback(config).allowEmptyMorpheme(false));
tokenizer = (JapaneseTokenizer) dict.create();
tokenizer = (JapaneseTokenizer) dict.tokenizer();

List<Morpheme> s = tokenizer.tokenize("…");
assertThat(s.size(), is(3));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public class JoinKatakanaOovPluginTest {
@Before
public void setUp() throws IOException {
Dictionary dict = TestDictionary.INSTANCE.user1();
tokenizer = (JapaneseTokenizer) dict.create();
tokenizer = (JapaneseTokenizer) dict.tokenizer();
plugin = new JoinKatakanaOovPlugin();
plugin.setOovFactory((short) -1);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public void setUp() throws IOException {
Config config = TestDictionary.INSTANCE.user0Cfg()
.characterDefinition(getClass().getClassLoader().getResource("joinnumeric/char.def"));
Dictionary dict = new DictionaryFactory().create(config);
tokenizer = (JapaneseTokenizer) dict.create();
tokenizer = (JapaneseTokenizer) dict.tokenizer();

plugin = new JoinNumericPlugin();
plugin.setSettings(Settings.parse("{}", PathAnchor.none()));
Expand Down
10 changes: 5 additions & 5 deletions src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class MorphemeImplTest {
fun useToString() {
val dic = TestDictionary.user0()
// should be split into す/だ/ち, all of them are OOV
val sudachi = dic.create().tokenize("すだち")
val sudachi = dic.tokenizer().tokenize("すだち")
// wid of OOV is (0xf, posId)
assertEquals(
"MorphemeImpl{begin=0, end=1, surface=す, pos=4/名詞,普通名詞,一般,*,*,*, wid=(15,4)}",
Expand All @@ -36,20 +36,20 @@ class MorphemeImplTest {
fun userdata() {
// system
val sdic = TestDictionary.user0()
val tokyo = sdic.create().tokenize("東京")
val tokyo = sdic.tokenizer().tokenize("東京")
assertTrue(tokyo[0].getUserData().isEmpty())

// oov
val oovs = sdic.create().tokenize("すだち")
val oovs = sdic.tokenizer().tokenize("すだち")
assertTrue(oovs[0].getUserData().isEmpty())

// user with data
val udic = TestDictionary.user1()
val sudachi = udic.create().tokenize("すだち")
val sudachi = udic.tokenizer().tokenize("すだち")
assertEquals("徳島県産", sudachi[0].getUserData())

// user without data
val piraru = udic.create().tokenize("ぴらる")
val piraru = udic.tokenizer().tokenize("ぴらる")
assertTrue(piraru[0].getUserData().isEmpty())
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class OovProviderPluginTest {
val dict = DictionaryFactory().create(cfg) as JapaneseDictionary
val plugin = assertIs<FakeOovProvider>(dict.oovProviderPlugins.last())
assertEquals(8, plugin.posId)
val tokinzer = dict.create()
val tokinzer = dict.tokenizer()
val tokens = tokinzer.tokenize("すだちかぼす")
assertEquals("スダチ", tokens[0].partOfSpeech()[5])
assertEquals("カボス", tokens[1].partOfSpeech()[5])
Expand Down
11 changes: 6 additions & 5 deletions src/test/java/com/worksap/nlp/sudachi/PosMatcherTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@ import kotlin.test.*
class PosMatcherTest {

private val dic = DictionaryFactory().create(TestDictionary.user2Cfg()) as JapaneseDictionary
private val tok = dic.tokenizer()

@Test
fun basic() {
val nouns = dic.posMatcher(PartialPOS("名詞"))
val morphs = dic.create().tokenize("京都に行った")
val morphs = tok.tokenize("京都に行った")
assertEquals(4, morphs.size)
assertTrue(nouns.test(morphs[0]))
assertFalse(nouns.test(morphs[1]))
Expand All @@ -37,7 +38,7 @@ class PosMatcherTest {
@Test
fun userDic() {
val filter = dic.posMatcher { it[3] == "ミカン科" }
val morphs = dic.create().tokenize("すだちにかぼす")
val morphs = tok.tokenize("すだちにかぼす")
assertEquals(3, morphs.size)
assertTrue(filter.test(morphs[0]))
assertFalse(filter.test(morphs[1]))
Expand All @@ -49,7 +50,7 @@ class PosMatcherTest {
val f1 = dic.posMatcher { it[5] == "スダチ" }
val f2 = dic.posMatcher { it[5] == "カボス" }
val filter = f1.union(f2)
val morphs = dic.create().tokenize("すだちにかぼす")
val morphs = tok.tokenize("すだちにかぼす")
assertEquals(3, morphs.size)
assertTrue(filter.test(morphs[0]))
assertFalse(filter.test(morphs[1]))
Expand All @@ -61,7 +62,7 @@ class PosMatcherTest {
val f1 = dic.posMatcher { it[5] == "終止形-一般" }
val f2 = dic.posMatcher { it[0] == "動詞" }
val filter = f1.intersection(f2)
val morphs = dic.create().tokenize("いった東京行く")
val morphs = tok.tokenize("いった東京行く")
assertEquals(4, morphs.size)
assertFalse(filter.test(morphs[0]))
assertFalse(filter.test(morphs[1]))
Expand All @@ -72,7 +73,7 @@ class PosMatcherTest {
@Test
fun invert() {
val filter = dic.posMatcher { it[3] == "ミカン科" }.invert()
val morphs = dic.create().tokenize("すだちにかぼす")
val morphs = tok.tokenize("すだちにかぼす")
assertEquals(3, morphs.size)
assertFalse(filter.test(morphs[0]))
assertTrue(filter.test(morphs[1]))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class RegexOovProviderTest {
.addList("pos", "名詞", "普通名詞", "一般", "*", "*", "*")
@Suppress("UNCHECKED_CAST") block(cfg, pluginCfg as Config.PluginConf<RegexOovProvider>)
// prepend our OOV configuration to the main configuration
return DictionaryFactory().create(cfg.withFallback(TestDictionary.user0Cfg())).create()
return DictionaryFactory().create(cfg.withFallback(TestDictionary.user0Cfg())).tokenizer()
}

@Test
Expand Down
8 changes: 4 additions & 4 deletions src/test/java/com/worksap/nlp/sudachi/UserDictionaryTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public void fullUserDict() throws IOException {
config.addUserDictionary(instance.getUserDict2());

try (Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("ぴさる");
assertThat(morphs.size(), is(1));
Morpheme m = morphs.get(0);
Expand All @@ -62,7 +62,7 @@ public void splitForUserDict() throws IOException {
TestDictionary td = TestDictionary.INSTANCE;
Config config = td.user0Cfg().addUserDictionary(td.getUserDict2()).addUserDictionary(td.getUserDict1());
try (Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("東京府");
assertThat(morphs.size(), is(1));
Morpheme m = morphs.get(0);
Expand All @@ -77,7 +77,7 @@ public void splitForUserDict() throws IOException {
public void userDefinedPos() throws IOException {
Config config = TestDictionary.INSTANCE.user2Cfg();
try (Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("すだちかぼす");
assertThat(morphs.size(), is(2));
Morpheme m = morphs.get(0);
Expand All @@ -89,7 +89,7 @@ public void userDefinedPos() throws IOException {
TestDictionary td = TestDictionary.INSTANCE;
config = td.user0Cfg().addUserDictionary(td.getUserDict2()).addUserDictionary(td.getUserDict1());
try (Dictionary dict = new DictionaryFactory().create(config)) {
Tokenizer tokenizer = dict.create();
Tokenizer tokenizer = dict.tokenizer();
List<Morpheme> morphs = tokenizer.tokenize("すだちかぼす");
assertThat(morphs.size(), is(2));
Morpheme m = morphs.get(0);
Expand Down

0 comments on commit a349c54

Please sign in to comment.