add test for dict.entries and fix

WorksApplications · Nov 27, 2024 · 02de218 · 02de218
1 parent d6a65d9
commit 02de218
Show file tree

Hide file tree

Showing 6 changed files with 48 additions and 25 deletions.
diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
@@ -137,10 +137,14 @@ public void close() throws IOException {
      * Iterator of morphemes in the dictionary.
      */
     private class EntryItr implements Iterator<Morpheme> {
+        private final GrammarImpl grammar;
+        private final LexiconSet lexicon;
         private Iterator<Integer> wordIdItr;
 
         EntryItr() {
-            this.wordIdItr = getLexicon().wordIds();
+            this.grammar = getGrammar();
+            this.lexicon = getLexicon();
+            this.wordIdItr = this.lexicon.wordIds();
         }
 
         @Override
@@ -153,7 +157,7 @@ public Morpheme next() {
             if (!hasNext()) {
                 throw new NoSuchElementException();
             }
-            return new SingleMorphemeImpl(getGrammar(), getLexicon(), wordIdItr.next());
+            return new SingleMorphemeImpl(this.grammar, this.lexicon, wordIdItr.next());
         }
     }
 

diff --git a/src/main/java/com/worksap/nlp/sudachi/Morpheme.java b/src/main/java/com/worksap/nlp/sudachi/Morpheme.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -123,9 +123,10 @@ public interface Morpheme {
      * The IDs change when the dictionaries are updated or the combination of
      * dictionaries changes.
      *
-     * If the morpheme is OOV, it returns an undefined value.
+     * If the morpheme is OOV, it returns an ID consisting of the OOV flag and the POS ID.
      *
      * @return the word ID
+     * @see WordId
      */
     public int getWordId();
 

diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -105,7 +105,7 @@ public long parameters(int wordId) {
 
     private class Itr implements Iterator<int[]> {
         private final Iterator<int[]> iterator;
-        private Integer[] wordIds;
+        private int[] wordIds;
         private int length;
         private int index;
 
@@ -155,9 +155,9 @@ public Iterator<Integer> wordIds() {
     }
 
     private class WordIdItr implements Iterator<Integer> {
-        Iterator<Ints> iterator;
-        Ints ints;
-        int index;
+        private final Iterator<Ints> iterator;
+        private Ints ints;
+        private int index;
 
         WordIdItr() {
             this.iterator = getWordIdTable().wordIds();
@@ -166,7 +166,7 @@ private class WordIdItr implements Iterator<Integer> {
 
         @Override
         public boolean hasNext() {
-            if (ints == null || index >= ints.length()) {
+            while (ints == null || index >= ints.length()) {
                 if (!iterator.hasNext()) {
                     return false;
                 }

diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Works Applications Co., Ltd.
+ * Copyright (c) 2017-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -202,8 +202,7 @@ public Integer next() {
             if (!hasNext()) {
                 throw new NoSuchElementException();
             }
-            int innerWordId = iterator.next();
-            return WordId.make(dictId, innerWordId);
+            return iterator.next();
         }
     }
 }
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,12 @@
 import java.util.NoSuchElementException;
 import java.util.Iterator;
 
+/**
+ * Table which contains the lists of (internal) word ids that share the same
+ * index form.
+ * 
+ * Automatically fills the dictionary part of each word id from the configured dictionary id.
+ */
 public class WordIdTable {
     private final ByteBuffer bytes;
     private int dicIdMask = 0;
@@ -31,19 +37,13 @@ public class WordIdTable {
         this.bytes = bytes;
     }
 
-    Integer[] get(int index) {
+    int[] get(int index) {
         ByteBuffer dup = bytes.duplicate();
         dup.position(index);
         BufReader reader = new BufReader(dup);
         int length = reader.readVarint32();
-        Integer[] result = new Integer[length];
-        int mask = dicIdMask;
-        int sum = 0;
-        for (int i = 0; i < length; i++) {
-            int v = reader.readVarint32();
-            result[i] = WordId.applyMask(v + sum, mask);
-            sum += v;
-        }
+        int[] result = new int[length];
+        readDeltaCompressed(result, length, this.dicIdMask, reader);
         return result;
     }
 
@@ -75,8 +75,8 @@ private static void readDeltaCompressed(int[] result, int count, int mask, BufRe
         }
     }
 
-    void setDictionaryId(int id) {
-        dicIdMask = WordId.dicIdMask(id);
+    void setDictionaryId(int dictId) {
+        dicIdMask = WordId.dicIdMask(dictId);
     }
 
     /**

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.kt
@@ -103,6 +103,25 @@ class JapaneseDictionaryTest {
     assertFailsWith(IllegalStateException::class) { tok.tokenize("a") }
   }
 
+  @Test
+  fun entries() {
+    // contains all morphemes, each with a distinct wordId
+    assertEquals(41, dict.entries().map { m -> m.getWordId() }.distinct().count())
+    // use grammar
+    assertEquals(6, dict.entries().filter { m -> m.partOfSpeech().get(1) == "固有名詞" }.count())
+    // use lexicon
+    assertEquals(4, dict.entries().filter { m -> m.readingForm().contains("キョウ") }.count())
+  }
+
+  @Test
+  fun entriesWithUser() {
+    val udict = TestDictionary.user1()
+    assertEquals(41 + 4, udict.entries().map { m -> m.getWordId() }.distinct().count())
+    assertEquals(6 + 1, udict.entries().filter { m -> m.partOfSpeech().get(1) == "固有名詞" }.count())
+    assertEquals(4 + 1, udict.entries().filter { m -> m.readingForm().contains("キョウ") }.count())
+    udict.close()
+  }
+
   @Test
   fun lookupEntries() {
     // nothing