Skip to content

Commit

Permalink
add test for dict.entries and fix
Browse files Browse the repository at this point in the history
  • Loading branch information
mh-northlander committed Nov 27, 2024
1 parent d6a65d9 commit 02de218
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 25 deletions.
8 changes: 6 additions & 2 deletions src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,14 @@ public void close() throws IOException {
* Iterator of morphemes in the dictionary.
*/
private class EntryItr implements Iterator<Morpheme> {
private final GrammarImpl grammar;
private final LexiconSet lexicon;
private Iterator<Integer> wordIdItr;

EntryItr() {
this.wordIdItr = getLexicon().wordIds();
this.grammar = getGrammar();
this.lexicon = getLexicon();
this.wordIdItr = this.lexicon.wordIds();
}

@Override
Expand All @@ -153,7 +157,7 @@ public Morpheme next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
return new SingleMorphemeImpl(getGrammar(), getLexicon(), wordIdItr.next());
return new SingleMorphemeImpl(this.grammar, this.lexicon, wordIdItr.next());
}
}

Expand Down
5 changes: 3 additions & 2 deletions src/main/java/com/worksap/nlp/sudachi/Morpheme.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -123,9 +123,10 @@ public interface Morpheme {
* The IDs change when the dictionaries are updated or the combination of
* dictionaries changes.
*
* If the morpheme is OOV, it returns an undefined value.
* If the morpheme is OOV, it returns an ID consisting of the OOV flag and the POS ID.
*
* @return the word ID
* @see WordId
*/
public int getWordId();

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -105,7 +105,7 @@ public long parameters(int wordId) {

private class Itr implements Iterator<int[]> {
private final Iterator<int[]> iterator;
private Integer[] wordIds;
private int[] wordIds;
private int length;
private int index;

Expand Down Expand Up @@ -155,9 +155,9 @@ public Iterator<Integer> wordIds() {
}

private class WordIdItr implements Iterator<Integer> {
Iterator<Ints> iterator;
Ints ints;
int index;
private final Iterator<Ints> iterator;
private Ints ints;
private int index;

WordIdItr() {
this.iterator = getWordIdTable().wordIds();
Expand All @@ -166,7 +166,7 @@ private class WordIdItr implements Iterator<Integer> {

@Override
public boolean hasNext() {
if (ints == null || index >= ints.length()) {
while (ints == null || index >= ints.length()) {
if (!iterator.hasNext()) {
return false;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2022 Works Applications Co., Ltd.
* Copyright (c) 2017-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -202,8 +202,7 @@ public Integer next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
int innerWordId = iterator.next();
return WordId.make(dictId, innerWordId);
return iterator.next();
}
}
}
24 changes: 12 additions & 12 deletions src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -23,6 +23,12 @@
import java.util.NoSuchElementException;
import java.util.Iterator;

/**
* Table which contains the lists of (internal) word ids that share the same
* index form.
*
* Automatically fills in the dictionary part of each word id using the
* dictionary id that has been set.
*/
public class WordIdTable {
private final ByteBuffer bytes;
private int dicIdMask = 0;
Expand All @@ -31,19 +37,13 @@ public class WordIdTable {
this.bytes = bytes;
}

Integer[] get(int index) {
int[] get(int index) {
ByteBuffer dup = bytes.duplicate();
dup.position(index);
BufReader reader = new BufReader(dup);
int length = reader.readVarint32();
Integer[] result = new Integer[length];
int mask = dicIdMask;
int sum = 0;
for (int i = 0; i < length; i++) {
int v = reader.readVarint32();
result[i] = WordId.applyMask(v + sum, mask);
sum += v;
}
int[] result = new int[length];
readDeltaCompressed(result, length, this.dicIdMask, reader);
return result;
}

Expand Down Expand Up @@ -75,8 +75,8 @@ private static void readDeltaCompressed(int[] result, int count, int mask, BufRe
}
}

void setDictionaryId(int id) {
dicIdMask = WordId.dicIdMask(id);
void setDictionaryId(int dictId) {
dicIdMask = WordId.dicIdMask(dictId);
}

/**
Expand Down
19 changes: 19 additions & 0 deletions src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,25 @@ class JapaneseDictionaryTest {
assertFailsWith(IllegalStateException::class) { tok.tokenize("a") }
}

@Test
fun entries() {
    // Counting the distinct word ids over all entries must give 41,
    // i.e. no two listed morphemes share a word id.
    val distinctWordIds = dict.entries().map { morpheme -> morpheme.getWordId() }.distinct().count()
    assertEquals(41, distinctWordIds)

    // Part-of-speech lookup goes through the grammar.
    val properNouns = dict.entries().filter { morpheme -> morpheme.partOfSpeech()[1] == "固有名詞" }.count()
    assertEquals(6, properNouns)

    // Reading-form lookup goes through the lexicon.
    val kyoReadings = dict.entries().filter { morpheme -> morpheme.readingForm().contains("キョウ") }.count()
    assertEquals(4, kyoReadings)
}

@Test
fun entriesWithUser() {
    // Same checks as entries(), run against a dictionary obtained from TestDictionary.user1().
    // NOTE(review): the "+ 4" / "+ 1" terms are presumably the entries contributed by the
    // user dictionary on top of the counts asserted in entries() — confirm against the fixture.
    val udict = TestDictionary.user1()
    try {
        assertEquals(41 + 4, udict.entries().map { m -> m.getWordId() }.distinct().count())
        assertEquals(6 + 1, udict.entries().filter { m -> m.partOfSpeech().get(1) == "固有名詞" }.count())
        assertEquals(4 + 1, udict.entries().filter { m -> m.readingForm().contains("キョウ") }.count())
    } finally {
        // Close the dictionary even when an assertion above throws, so a failing
        // test does not leave the dictionary opened here unclosed.
        udict.close()
    }
}

@Test
fun lookupEntries() {
// nothing
Expand Down

0 comments on commit 02de218

Please sign in to comment.