Skip to content

Commit

Permalink
Merge pull request #151 from WorksApplications/fix/disallow-empty-mor…
Browse files Browse the repository at this point in the history
…pheme

Disallow empty morpheme by default
  • Loading branch information
mh-northlander authored Nov 11, 2024
2 parents ee664ba + 6e45504 commit 03224f6
Show file tree
Hide file tree
Showing 10 changed files with 162 additions and 49 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ The `sudachi_tokenizer` tokenizer tokenizes input texts using Sudachi.
- A: The shortest units equivalent to the UniDic short unit
- Ex) 選挙,管理,委員,会
- discard\_punctuation: Select to discard punctuation or not. (bool, default: true)
- allow\_empty\_morpheme: Select whether output morphemes are allowed to have an empty span. (bool, default: false)
- An empty span can occur when the input text contains a composite character (e.g. ㍿) that is split into multiple morphemes. If false (default), every morpheme split from such a character covers the span of the whole character. If true, only the first morpheme covers the span of the character, and the spans of the remaining morphemes can be empty.
- settings\_path: Sudachi setting file path. The path may be absolute or relative; relative paths are resolved with respect to es\_config. (string, default: null)
- resources\_path: Sudachi dictionary path. The path may be absolute or relative; relative paths are resolved with respect to es\_config. (string, default: null)
- additional_settings: Describes a configuration JSON string for Sudachi. This JSON string will be merged into the default configuration. If this property is set, `settings_path` will be overridden.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,52 +29,61 @@ class ConfigAdapter(anchor: PathAnchor, settings: Settings, env: Environment) {
private val basePath = resourcesPath(env, settings)
private val fullAnchor = PathAnchor.filesystem(basePath).andThen(anchor)

val discardPunctuation: Boolean = settings.getAsBoolean(PARAM_DISCARD_PUNCTUATION, true)
// defaults to false so that every morpheme has a non-empty span in the input text
val allowEmptyMorpheme: Boolean = settings.getAsBoolean(PARAM_ALLOW_EMPTY_MORPHEME, false)
val mode = splitMode(settings)

val compiled: Config = run {
val base = settingsFile(settings)
val additional = settingsInlineString(settings, fullAnchor)
additional.withFallback(base).anchoredWith(fullAnchor)
}

val discardPunctuation: Boolean = settings.getAsBoolean(PARAM_DISCARD_PUNCTUATION, true)

val mode = splitMode(settings)

private fun settingsFile(settings: Settings): Config {
val settingsPath = settings.get(PARAM_SETTINGS_PATH)
return if (settingsPath == null) {
readDefaultConfig(basePath, fullAnchor)
} else {
val configObject = fullAnchor.resource<Any>(settingsPath)
Config.fromResource(configObject, fullAnchor)
}
val base =
if (settingsPath == null) {
readDefaultConfig(basePath, fullAnchor)
} else {
val configObject = fullAnchor.resource<Any>(settingsPath)
Config.fromResource(configObject, fullAnchor)
}
return base.allowEmptyMorpheme(allowEmptyMorpheme)
}

companion object {
const val PARAM_SPLIT_MODE_DEPRECATED = "mode"
const val PARAM_SPLIT_MODE = "split_mode"
const val PARAM_SETTINGS_PATH = "settings_path"
const val PARAM_RESOURCES_PATH = "resources_path"
const val PARAM_ADDITIONAL_SETTINGS = "additional_settings"
const val PARAM_DISCARD_PUNCTUATION = "discard_punctuation"
const val PARAM_ALLOW_EMPTY_MORPHEME = "allow_empty_morpheme"

const val DEFAULT_SETTINGS_FILENAME = "sudachi.json"
const val DEFAULT_RESOURCE_PATH = "sudachi"

private object SplitModeFlag : EnumFlag<SplitMode>("split_mode", SplitMode.C)
private object SplitModeFlag : EnumFlag<SplitMode>(PARAM_SPLIT_MODE, SplitMode.C)

@JvmStatic
fun splitMode(settings: Settings): SplitMode {
if (settings.get(PARAM_SPLIT_MODE_DEPRECATED, null) != null) {
throw IllegalArgumentException(
"Setting $PARAM_SPLIT_MODE_DEPRECATED is deprecated, use SudachiSplitFilter instead",
"Setting $PARAM_SPLIT_MODE_DEPRECATED is deprecated, use $PARAM_SPLIT_MODE instead",
)
}
return SplitModeFlag.get(settings)
}

@JvmStatic
fun resourcesPath(env: Environment, settings: Settings): Path {
return env.configFile().resolve(settings.get("resources_path", "sudachi"))
return env.configFile().resolve(settings.get(PARAM_RESOURCES_PATH, DEFAULT_RESOURCE_PATH))
}

private fun readDefaultConfig(root: Path, baseAnchor: PathAnchor): Config {
val anchor = PathAnchor.filesystem(root).andThen(baseAnchor)
val resolved = root.resolve("sudachi.json")
val resolved = root.resolve(DEFAULT_SETTINGS_FILENAME)
val exists =
try {
resolved.exists()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,8 @@ class SudachiTokenizerFactory(
}
}

private val mode = ConfigAdapter.splitMode(settings)

private val config = ConfigAdapter(service.anchor, settings, env)
private val mode = config.mode

private val dictionary by lazy { service.forConfig(config.compiled) }

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,68 @@ class CustomAnalyzerTest : SearchEngineTestBase {
""".jsonSettings()
val analyzers = engine.indexAnalyzers(settings)
val basic = analyzers.get("sudachi_basic")
basic.assertTerms("東京に行く", "東京", "", "行く")
basic.assertTerms("東京に行く。", "東京", "", "行く")
}

/**
 * Checks the terms produced by a custom analyzer whose `sudachi_tokenizer`
 * is configured with `discard_punctuation` set to `false`.
 *
 * The input sentence ends with a full stop, and the assertion expects four
 * terms for it.
 *
 * NOTE(review): two of the expected terms are empty strings as written here;
 * presumably they stand for single-character tokens - confirm against the
 * repository before relying on the literal values.
 */
@Test
fun discardPunctuationFalse() {
// Index settings: one custom analyzer backed by a sudachi_tokenizer with discard_punctuation disabled.
val settings =
"""
{
"index.analysis": {
"analyzer": {
"sudachi_basic": {
"type": "custom",
"tokenizer": "sudachi_tokenizer"
}
},
"tokenizer": {
"sudachi_tokenizer": {
"type": "sudachi_tokenizer",
"discard_punctuation": false
}
}
}
}
""".jsonSettings()
// Build the analyzers from the settings and look up the one declared above.
val analyzers = engine.indexAnalyzers(settings)
val basic = analyzers.get("sudachi_basic")
// First argument is the input text; the remaining arguments are the expected terms.
basic.assertTerms("東京に行く。", "東京", "", "行く", "")
}

/**
 * Checks the terms produced by a custom analyzer whose `sudachi_tokenizer`
 * is configured with `split_mode` "A" and `allow_empty_morpheme` set to `true`.
 *
 * The input starts with the composite character ㍿, and the assertion expects
 * four terms for it.
 *
 * NOTE(review): three of the expected terms are empty strings as written here;
 * confirm the literal values against the repository.
 */
@Test
fun allowEmptyMorphemeTrue() {
// Index settings: one custom analyzer backed by a sudachi_tokenizer using split mode A
// with allow_empty_morpheme enabled.
val settings =
"""
{
"index.analysis": {
"analyzer": {
"sudachi_basic": {
"type": "custom",
"tokenizer": "sudachi_tokenizer"
}
},
"tokenizer": {
"sudachi_tokenizer": {
"type": "sudachi_tokenizer",
"split_mode": "A",
"allow_empty_morpheme": true
}
}
}
}
""".jsonSettings()
// Build the analyzers from the settings and look up the one declared above.
val analyzers = engine.indexAnalyzers(settings)
val basic = analyzers.get("sudachi_basic")
// First argument is the input text; the remaining arguments are the expected terms.
basic.assertTerms("㍿に行く", "", "", "", "行く")
}

@Test
fun stoptagsEmpty() {
val settings =
"""
{
"index.analysis": {
"index.analysis": {
"analyzer": {
"sudachi_basic": {
"type": "custom",
Expand All @@ -71,7 +124,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
},
"filter": {
"pos": {
"type": "sudachi_part_of_speech"
"type": "sudachi_part_of_speech"
}
}
}
Expand All @@ -87,7 +140,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
val settings =
"""
{
"index.analysis": {
"index.analysis": {
"analyzer": {
"sudachi_basic": {
"type": "custom",
Expand Down Expand Up @@ -122,7 +175,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
val settings =
"""
{
"index.analysis": {
"index.analysis": {
"analyzer": {
"sudachi_test": {
"type": "custom",
Expand Down Expand Up @@ -156,7 +209,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
val settings =
"""
{
"index.analysis": {
"index.analysis": {
"analyzer": {
"sudachi_test": {
"type": "custom",
Expand Down Expand Up @@ -190,7 +243,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
val settings =
"""
{
"index.analysis": {
"index.analysis": {
"analyzer": {
"sudachi_test": {
"type": "custom",
Expand Down Expand Up @@ -224,7 +277,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
val settings =
"""
{
"index.analysis": {
"index.analysis": {
"analyzer": {
"sudachi_test": {
"type": "custom",
Expand Down Expand Up @@ -259,7 +312,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
val settings =
"""
{
"index.analysis": {
"index.analysis": {
"analyzer": {
"sudachi_test": {
"type": "custom",
Expand Down Expand Up @@ -294,7 +347,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
val settings =
"""
{
"index.analysis": {
"index.analysis": {
"analyzer": {
"sudachi_test": {
"type": "custom",
Expand Down Expand Up @@ -329,7 +382,7 @@ class CustomAnalyzerTest : SearchEngineTestBase {
val settings =
"""
{
"index.analysis": {
"index.analysis": {
"analyzer": {
"sudachi_test": {
"type": "custom",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ open class TestSudachiAnalysis : SearchEngineTestBase {
val settings =
"""
{
"index.analysis": {
"index.analysis": {
"analyzer": {
"sudachi": {
"type": "sudachi",
Expand All @@ -71,7 +71,7 @@ open class TestSudachiAnalysis : SearchEngineTestBase {
val settings =
"""
{
"index.analysis": {
"index.analysis": {
"analyzer": {
"sudachi": {
"type": "sudachi",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,8 +201,8 @@ class TestSudachiSplitFilter : BaseTokenStreamTestCase() {
val tokenStream = setUpTokenStream("search", "六三四㍿に行くカ゛カ゛カ゛")
assertTokenStreamContents(
tokenStream,
arrayOf("六三四", "", "", "", "", "行く", "カ゛カ゛カ゛", "カ゛カ゛", "カ゛"),
intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11),
arrayOf("六三四", "", "", "", "", "行く", "カ゛カ゛カ゛", "カ゛カ゛", "カ゛"),
intArrayOf(0, 3, 3, 3, 4, 5, 7, 7, 11),
intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13),
intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1),
intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1),
Expand All @@ -219,7 +219,7 @@ class TestSudachiSplitFilter : BaseTokenStreamTestCase() {
assertTokenStreamContents(
tokenStream,
arrayOf("六三四", "株式会社", "株式", "会社", "", "行く", "ガガガ", "ガガ", ""),
intArrayOf(0, 3, 3, 4, 4, 5, 7, 7, 11),
intArrayOf(0, 3, 3, 3, 4, 5, 7, 7, 11),
intArrayOf(3, 4, 4, 4, 5, 7, 13, 11, 13),
intArrayOf(1, 1, 0, 1, 1, 1, 1, 0, 1),
intArrayOf(1, 2, 1, 1, 1, 1, 2, 1, 1),
Expand Down
Loading

0 comments on commit 03224f6

Please sign in to comment.