Merge pull request #151 from WorksApplications/fix/disallow-empty-morpheme

Disallow empty morpheme by default
WorksApplications · Nov 11, 2024 · 03224f6 · 03224f6
2 parents ee664ba + 6e45504
commit 03224f6
Show file tree

Hide file tree

Showing 10 changed files with 162 additions and 49 deletions.
diff --git a/README.md b/README.md
@@ -102,6 +102,8 @@ The `sudachi_tokenizer` tokenizer tokenizes input texts using Sudachi.
   - A: The shortest units equivalent to the UniDic short unit
       - Ex) 選挙,管理,委員,会
 - discard\_punctuation: Select to discard punctuation or not. (bool, default: true)
+- allow\_empty\_morpheme: Allow output morphemes to have an empty span. (bool, default: false)
+  - An empty span can occur when the input text contains a composite character (e.g. ㍿) that is split into multiple morphemes. If false (default), every morpheme split from the character has the span of the whole character. If true, only the first morpheme has the span of the character, and the spans of the other morphemes can be empty.
 - settings\_path: Sudachi setting file path. The path may be absolute or relative; relative paths are resolved with respect to es\_config. (string, default: null)
 - resources\_path: Sudachi dictionary path. The path may be absolute or relative; relative paths are resolved with respect to es\_config. (string, default: null)
 - additional_settings: Describes a configuration JSON string for Sudachi. This JSON string will be merged into the default configuration. If this property is set, `settings_path` will be overridden.

diff --git a/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt b/src/main/java/com/worksap/nlp/elasticsearch/sudachi/ConfigAdapter.kt
@@ -29,52 +29,61 @@ class ConfigAdapter(anchor: PathAnchor, settings: Settings, env: Environment) {
   private val basePath = resourcesPath(env, settings)
   private val fullAnchor = PathAnchor.filesystem(basePath).andThen(anchor)
 
+  val discardPunctuation: Boolean = settings.getAsBoolean(PARAM_DISCARD_PUNCTUATION, true)
+  // defaults to false so that every morpheme has a non-empty span in the input text
+  val allowEmptyMorpheme: Boolean = settings.getAsBoolean(PARAM_ALLOW_EMPTY_MORPHEME, false)
+  val mode = splitMode(settings)
+
   val compiled: Config = run {
     val base = settingsFile(settings)
     val additional = settingsInlineString(settings, fullAnchor)
     additional.withFallback(base).anchoredWith(fullAnchor)
   }
 
-  val discardPunctuation: Boolean = settings.getAsBoolean(PARAM_DISCARD_PUNCTUATION, true)
-
-  val mode = splitMode(settings)
-
   private fun settingsFile(settings: Settings): Config {
     val settingsPath = settings.get(PARAM_SETTINGS_PATH)
-    return if (settingsPath == null) {
-      readDefaultConfig(basePath, fullAnchor)
-    } else {
-      val configObject = fullAnchor.resource<Any>(settingsPath)
-      Config.fromResource(configObject, fullAnchor)
-    }
+    val base =
+        if (settingsPath == null) {
+          readDefaultConfig(basePath, fullAnchor)
+        } else {
+          val configObject = fullAnchor.resource<Any>(settingsPath)
+          Config.fromResource(configObject, fullAnchor)
+        }
+    return base.allowEmptyMorpheme(allowEmptyMorpheme)
   }
 
   companion object {
     const val PARAM_SPLIT_MODE_DEPRECATED = "mode"
+    const val PARAM_SPLIT_MODE = "split_mode"
     const val PARAM_SETTINGS_PATH = "settings_path"
+    const val PARAM_RESOURCES_PATH = "resources_path"
     const val PARAM_ADDITIONAL_SETTINGS = "additional_settings"
     const val PARAM_DISCARD_PUNCTUATION = "discard_punctuation"
+    const val PARAM_ALLOW_EMPTY_MORPHEME = "allow_empty_morpheme"
+
+    const val DEFAULT_SETTINGS_FILENAME = "sudachi.json"
+    const val DEFAULT_RESOURCE_PATH = "sudachi"
 
-    private object SplitModeFlag : EnumFlag<SplitMode>("split_mode", SplitMode.C)
+    private object SplitModeFlag : EnumFlag<SplitMode>(PARAM_SPLIT_MODE, SplitMode.C)
 
     @JvmStatic
     fun splitMode(settings: Settings): SplitMode {
       if (settings.get(PARAM_SPLIT_MODE_DEPRECATED, null) != null) {
         throw IllegalArgumentException(
-            "Setting $PARAM_SPLIT_MODE_DEPRECATED is deprecated, use SudachiSplitFilter instead",
+            "Setting $PARAM_SPLIT_MODE_DEPRECATED is deprecated, use $PARAM_SPLIT_MODE instead",
         )
       }
       return SplitModeFlag.get(settings)
     }
 
     @JvmStatic
     fun resourcesPath(env: Environment, settings: Settings): Path {
-      return env.configFile().resolve(settings.get("resources_path", "sudachi"))
+      return env.configFile().resolve(settings.get(PARAM_RESOURCES_PATH, DEFAULT_RESOURCE_PATH))
     }
 
     private fun readDefaultConfig(root: Path, baseAnchor: PathAnchor): Config {
       val anchor = PathAnchor.filesystem(root).andThen(baseAnchor)
-      val resolved = root.resolve("sudachi.json")
+      val resolved = root.resolve(DEFAULT_SETTINGS_FILENAME)
       val exists =
           try {
             resolved.exists()

diff --git a/src/main/java/com/worksap/nlp/elasticsearch/sudachi/index/SudachiTokenizerFactory.kt b/src/main/java/com/worksap/nlp/elasticsearch/sudachi/index/SudachiTokenizerFactory.kt
@@ -50,9 +50,8 @@ class SudachiTokenizerFactory(
     }
   }
 
-  private val mode = ConfigAdapter.splitMode(settings)
-
   private val config = ConfigAdapter(service.anchor, settings, env)
+  private val mode = config.mode
 
   private val dictionary by lazy { service.forConfig(config.compiled) }
 

diff --git a/src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/CustomAnalyzerTest.kt b/src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/CustomAnalyzerTest.kt
@@ -45,15 +45,68 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     """.jsonSettings()
     val analyzers = engine.indexAnalyzers(settings)
     val basic = analyzers.get("sudachi_basic")
-    basic.assertTerms("東京に行く", "東京", "に", "行く")
+    basic.assertTerms("東京に行く。", "東京", "に", "行く")
+  }
+
+  @Test
+  fun discardPunctuationFalse() {
+    val settings =
+        """
+      {
+        "index.analysis": {
+          "analyzer": {
+            "sudachi_basic": {
+              "type": "custom",
+              "tokenizer": "sudachi_tokenizer"
+            }
+          },
+          "tokenizer": {
+            "sudachi_tokenizer": {
+              "type": "sudachi_tokenizer",
+              "discard_punctuation": false
+            }
+          }
+        }
+      }
+    """.jsonSettings()
+    val analyzers = engine.indexAnalyzers(settings)
+    val basic = analyzers.get("sudachi_basic")
+    basic.assertTerms("東京に行く。", "東京", "に", "行く", "。")
+  }
+
+  @Test
+  fun allowEmptyMorphemeTrue() {
+    val settings =
+        """
+      {
+        "index.analysis": {
+          "analyzer": {
+            "sudachi_basic": {
+              "type": "custom",
+              "tokenizer": "sudachi_tokenizer"
+            }
+          },
+          "tokenizer": {
+            "sudachi_tokenizer": {
+              "type": "sudachi_tokenizer",
+              "split_mode": "A",
+              "allow_empty_morpheme": true
+            }
+          }
+        }
+      }
+    """.jsonSettings()
+    val analyzers = engine.indexAnalyzers(settings)
+    val basic = analyzers.get("sudachi_basic")
+    basic.assertTerms("㍿に行く", "㍿", "", "に", "行く")
   }
 
   @Test
   fun stoptagsEmpty() {
     val settings =
         """
       {
-        "index.analysis": {          
+        "index.analysis": {
           "analyzer": {
             "sudachi_basic": {
               "type": "custom",
@@ -71,7 +124,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
           },
           "filter": {
             "pos": {
-              "type": "sudachi_part_of_speech"              
+              "type": "sudachi_part_of_speech"
             }
           }
         }
@@ -87,7 +140,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
       {
-        "index.analysis": {          
+        "index.analysis": {
           "analyzer": {
             "sudachi_basic": {
               "type": "custom",
@@ -122,7 +175,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
       {
-        "index.analysis": {          
+        "index.analysis": {
           "analyzer": {
             "sudachi_test": {
               "type": "custom",
@@ -156,7 +209,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
       {
-        "index.analysis": {          
+        "index.analysis": {
           "analyzer": {
             "sudachi_test": {
               "type": "custom",
@@ -190,7 +243,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
       {
-        "index.analysis": {          
+        "index.analysis": {
           "analyzer": {
             "sudachi_test": {
               "type": "custom",
@@ -224,7 +277,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
       {
-        "index.analysis": {          
+        "index.analysis": {
           "analyzer": {
             "sudachi_test": {
               "type": "custom",
@@ -259,7 +312,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
       {
-        "index.analysis": {          
+        "index.analysis": {
           "analyzer": {
             "sudachi_test": {
               "type": "custom",
@@ -294,7 +347,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
       {
-        "index.analysis": {          
+        "index.analysis": {
           "analyzer": {
             "sudachi_test": {
               "type": "custom",
@@ -329,7 +382,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
     val settings =
         """
       {
-        "index.analysis": {          
+        "index.analysis": {
           "analyzer": {
             "sudachi_test": {
               "type": "custom",

diff --git a/src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/TestSudachiAnalysis.kt b/src/test/java/com/worksap/nlp/elasticsearch/sudachi/index/TestSudachiAnalysis.kt
@@ -51,7 +51,7 @@ open class TestSudachiAnalysis : SearchEngineTestBase {
     val settings =
         """
       {
-        "index.analysis": {          
+        "index.analysis": {
           "analyzer": {
             "sudachi": {
               "type": "sudachi",
@@ -71,7 +71,7 @@ open class TestSudachiAnalysis : SearchEngineTestBase {
     val settings =
         """
       {
-        "index.analysis": {          
+        "index.analysis": {
           "analyzer": {
             "sudachi": {
               "type": "sudachi",

diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiSplitFilter.kt b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiSplitFilter.kt
@@ -201,8 +201,8 @@ class TestSudachiSplitFilter : BaseTokenStreamTestCase() {
     val tokenStream = setUpTokenStream("search", "六三四㍿に行くカ゛カ゛カ゛")
     assertTokenStreamContents(
         tokenStream,
-        arrayOf("六三四", "㍿", "㍿", "", "に", "行く", "カ゛カ゛カ゛", "カ゛カ゛", "カ゛"),
-        intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11),
+        arrayOf("六三四", "㍿", "㍿", "㍿", "に", "行く", "カ゛カ゛カ゛", "カ゛カ゛", "カ゛"),
+        intArrayOf(0, 3, 3, 3, 4, 5, 7, 7, 11),
         intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13),
         intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1),
         intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1),
@@ -219,7 +219,7 @@ class TestSudachiSplitFilter : BaseTokenStreamTestCase() {
     assertTokenStreamContents(
         tokenStream,
         arrayOf("六三四", "株式会社", "株式", "会社", "に", "行く", "ガガガ", "ガガ", "ガ"),
-        intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11),
+        intArrayOf(0, 3, 3, 3, 4, 5, 7, 7, 11),
         intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13),
         intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1),
         intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1),