diff --git a/README.md b/README.md index 33abd90..85b3dd5 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Lexical Analysis Command-Line Tool for lemmatizing, lexical analysis and languag Program help: ``` -las 1.4.1 +las 1.4.5 Usage: las [lemmatize|analyze|inflect|recognize|identify] [options] [...] Command: lemmatize @@ -13,7 +13,7 @@ Command: analyze Command: inflect (locales: de, en, fi, fr, it, liv, mdf, mhr, mrj, myv, sme, sv, tr, udm) Command: recognize -report recognition rate (locales: de, en, fi, fr, it, liv, mdf, mhr, mrj, myv, sme, sv, tr, udm, la +report word recognition rate (locales: de, en, fi, fr, it, liv, mdf, mhr, mrj, myv, sme, sv, tr, udm, la Command: identify identify language (locales: zh-TW, fi, no, hr, ta, ar, fr, is, lv, eu, mt, bn, dk, uk, pa, ga, br, so, pt, cs, fr, gl, sr, zh-CN, mrj, el, it, ca, vi, tl, nl, bg, ko, liv, it, mk, oc, et, af, de, ru, yi, cy, en, udm, ur, mdf, myv, sme, ru, ht, ml, th, id, sq, sv, de, sv, tr, da, en, gu, he, es, kn, sk, es, hi, te, mr, an, sw, be, pt, nl, ja, ast, fi, ro, mhr, ne, lt, no, km, sl, fa, ms, hu, pl, la, tr) --locale @@ -21,11 +21,13 @@ identify language (locales: zh-TW, fi, no, hr, ta, ar, fr, is, lv, eu, mt, bn, d --forms inclection forms for inflect/analyze --segment - segment compound words? + segment baseforms? --no-guess Don't guess baseforms for unknown words? --no-segment-guessed Don't guess segmentation information for guessed words (speeds up processing significantly)? + --process-by + Analysis unit when processing files (file, paragraph, line)? --max-edit-distance Maximum edit distance for error-correcting unidentified words (default 0)? --no-pretty diff --git a/build.sbt b/build.sbt index 01378e8..80dfb98 100644 --- a/build.sbt +++ b/build.sbt @@ -5,7 +5,7 @@ version := "1.4.2" scalaVersion := "2.11.8" libraryDependencies ++= Seq( - "fi.seco" % "lexicalanalysis" % "1.4.4", + "fi.seco" % "lexicalanalysis" % "1.4.5", "com.optimaize.languagedetector" % "language-detector" % "0.5", "com.github.scopt" %% "scopt" % "3.4.0", "com.typesafe.play" %% "play-json" % "2.5.3", @@ -24,6 +24,6 @@ assemblyMergeStrategy in assembly := { oldStrategy(x) } -assemblyOption in assembly := (assemblyOption in assembly).value.copy(prependShellScript = Some(Seq("#!/usr/bin/env sh", """exec java -jar "$0" "$@""""))) +assemblyOption in assembly := (assemblyOption in assembly).value.copy(prependShellScript = Some(Seq("#!/usr/bin/env sh", """exec java -jar -Xmx4G "$0" "$@""""))) assemblyJarName in assembly := "las" diff --git a/src/main/scala/LASCommandLineTool.scala b/src/main/scala/LASCommandLineTool.scala index 03ca31e..320fd79 100644 --- a/src/main/scala/LASCommandLineTool.scala +++ b/src/main/scala/LASCommandLineTool.scala @@ -31,10 +31,10 @@ object LASCommandLineTool { lazy val compoundlas = new CompoundLexicalAnalysisService(combinedlas, snowballlas) object LanguageDetector extends LazyLogging { - val languageProfiles = new LanguageProfileReader().readAllBuiltIn() - val supportedLanguages = languageProfiles.map(_.getLocale.toString()) - val detector = LanguageDetectorBuilder.create(NgramExtractors.standard()).withProfiles(languageProfiles).build() - val textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText() + lazy val languageProfiles = new LanguageProfileReader().readAllBuiltIn() + lazy val supportedLanguages = languageProfiles.map(_.getLocale.toString()) + lazy val detector = LanguageDetectorBuilder.create(NgramExtractors.standard()).withProfiles(languageProfiles).build() + lazy val textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText() def apply(text: String) = detector.getProbabilities(textObjectFactory.forText(text)) } @@ -54,7 +54,7 @@ object LASCommandLineTool { def main(args: Array[String]) = { val parser = new scopt.OptionParser[Config]("las") { - head("las", "1.4.2") + head("las", "1.4.5") cmd("lemmatize") action { (_, c) => c.copy(action = Action.Lemmatize) } text (s"(locales: ${compoundlas.getSupportedBaseformLocales.mkString(", ")})")