add multiple builds for different language and functionality combinations
jiemakel committed Jan 9, 2018
1 parent 980f5a2 commit fd7a65b
Showing 3 changed files with 162 additions and 55 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -5,6 +5,8 @@
/project/target/
/target/
/bin/
/build/
/dist/
/.cache
/.classpath
/.project
110 changes: 91 additions & 19 deletions build.sbt
@@ -1,29 +1,101 @@
name := """las"""

version := "1.5.9"

scalaVersion := "2.12.3"

libraryDependencies ++= Seq(
  "fi.seco" % "lexicalanalysis" % "1.5.11",
lazy val commonSettings = Seq(
  organization := "fi.seco",
  version := "1.5.14",
  scalaVersion := "2.12.4",
  libraryDependencies ++= Seq(
    "fi.seco" % "lexicalanalysis" % "1.5.14",
    "com.optimaize.languagedetector" % "language-detector" % "0.6",
    "com.github.scopt" %% "scopt" % "3.5.0",
    "com.typesafe.play" %% "play-json" % "2.6.0-M3",
    "com.typesafe.scala-logging" %% "scala-logging" % "3.5.0",
    "ch.qos.logback" % "logback-classic" % "1.2.2" % "runtime"
  ),
  resolvers ++= Seq(
    "Local Maven Repository" at Path.userHome.asFile.toURI.toURL + ".m2/repository"),
  fork in run := true,
)

lazy val rootSettings = Seq(
  publishArtifact := false,
  publishArtifact in Test := false,
)

lazy val assemblySettings = Seq(
  test in assembly := {},
  assemblyMergeStrategy in assembly := {
    case "is2/util/DB.class" => MergeStrategy.first
    case "fi/seco/lexical/hfst/resources.lst" => MergeStrategy.filterDistinctLines
    case other: Any => MergeStrategy.defaultMergeStrategy(other)
  },
  mainClass in assembly := Some("LASCommandLineTool"),
  assemblyOption in assembly := (assemblyOption in assembly).value.copy(prependShellScript = Some(Seq("#!/usr/bin/env sh", """exec java -jar "$0" "$@"""" + "\n")))
)
resolvers ++= Seq(
  "Local Maven Repository" at Path.userHome.asFile.toURI.toURL + ".m2/repository")

fork in run := true
lazy val main = project.in(file("build/main"))
  .settings(commonSettings:_*)
  .settings(scalaSource in Compile := baseDirectory.value / "../../src/main/scala")
  .disablePlugins(AssemblyPlugin)

lazy val fiComplete = (project in file("build/las-fi"))
  .settings(commonSettings:_*)
  .settings(assemblySettings:_*)
  .dependsOn(main)
  .settings(
    name := "las-fi",
    libraryDependencies += "fi.seco" % "lexicalanalysis-resources-fi-complete" % "1.5.14",
    assemblyOutputPath in assembly := file("dist/las-fi")
  )

lazy val fiSmall = (project in file("build/las-fi-small"))
  .settings(commonSettings:_*)
  .settings(assemblySettings:_*)
  .dependsOn(main)
  .settings(
    name := "las-fi-small",
    libraryDependencies += "fi.seco" % "lexicalanalysis-resources-fi-core" % "1.5.14",
    assemblyOutputPath in assembly := file("dist/las-fi-small")
  )

lazy val other = (project in file("build/las-non-fi"))
  .settings(commonSettings:_*)
  .settings(assemblySettings:_*)
  .dependsOn(main)
  .settings(
    name := "las-non-fi",
    libraryDependencies += "fi.seco" % "lexicalanalysis-resources-other" % "1.5.14",
    assemblyOutputPath in assembly := file("dist/las-non-fi")
  )

lazy val smallComplete = (project in file("build/las-small"))
  .settings(commonSettings:_*)
  .settings(assemblySettings:_*)
  .dependsOn(main)
  .settings(
    name := "las-complete",
    libraryDependencies ++= Seq(
      "fi.seco" % "lexicalanalysis-resources-fi-core" % "1.5.14",
      "fi.seco" % "lexicalanalysis-resources-other" % "1.5.14"
    ),
    assemblyOutputPath in assembly := file("dist/las-small")
  )


assemblyMergeStrategy in assembly := {
  case "is2/util/DB.class" => MergeStrategy.first
  case x =>
    val oldStrategy = (assemblyMergeStrategy in assembly).value
    oldStrategy(x)
}
lazy val complete = (project in file("build/las-complete"))
  .settings(commonSettings:_*)
  .settings(assemblySettings:_*)
  .dependsOn(main)
  .settings(
    name := "las-complete",
    libraryDependencies ++= Seq(
      "fi.seco" % "lexicalanalysis-resources-fi-complete" % "1.5.14",
      "fi.seco" % "lexicalanalysis-resources-other" % "1.5.14"
    ),
    assemblyOutputPath in assembly := file("dist/las")
  )

assemblyOption in assembly := (assemblyOption in assembly).value.copy(prependShellScript = Some(Seq("#!/usr/bin/env sh", """exec java -jar -Xmx4G "$0" "$@"""" + "\n")))
lazy val las = project.in(file("."))
  .settings(commonSettings:_*)
  .settings(rootSettings:_*)
  .disablePlugins(AssemblyPlugin)
  .aggregate(complete,fiSmall,fiComplete,other,smallComplete)

assemblyJarName in assembly := "las"
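The new build.sbt defines one sbt subproject per language/resource combination; each shares commonSettings and assemblySettings and writes a self-executing assembly into dist/, which is why /build/ and /dist/ were added to .gitignore. A further combination would follow the same pattern. The sketch below is purely illustrative and not part of this commit: the svOnly project name and the lexicalanalysis-resources-sv artifact are assumptions.

lazy val svOnly = (project in file("build/las-sv"))
  .settings(commonSettings:_*)
  .settings(assemblySettings:_*)
  .dependsOn(main)
  .settings(
    // Hypothetical Swedish-only resources; the artifact name is assumed for illustration.
    name := "las-sv",
    libraryDependencies += "fi.seco" % "lexicalanalysis-resources-sv" % "1.5.14",
    assemblyOutputPath in assembly := file("dist/las-sv")
  )

With the usual sbt-assembly workflow, each subproject's binary would be produced by its own assembly task (for example sbt svOnly/assembly for the sketch above), while the root las project only aggregates the subprojects and publishes no artifact of its own.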
105 changes: 69 additions & 36 deletions src/main/scala/LASCommandLineTool.scala
@@ -1,31 +1,24 @@
import fi.seco.lexical.hfst.HFSTLexicalAnalysisService
import fi.seco.lexical.combined.CombinedLexicalAnalysisService
import fi.seco.lexical.SnowballLexicalAnalysisService
import fi.seco.lexical.CompoundLexicalAnalysisService
import com.typesafe.scalalogging.LazyLogging
import fi.seco.lexical.LanguageRecognizer
import scala.collection.convert.WrapAsScala._
import scala.collection.convert.WrapAsJava._
import scala.util.Try
import java.io.{File, PrintWriter}
import java.util.Locale
import java.util.HashMap
import java.io.File
import scala.io.StdIn
import scala.io.Source
import java.io.PrintWriter
import play.api.libs.json.JsValue
import play.api.libs.json.Json
import fi.seco.lexical.hfst.HFSTLexicalAnalysisService.WordToResults
import play.api.libs.json.Writes
import java.util.Collections

import com.optimaize.langdetect.LanguageDetectorBuilder
import com.optimaize.langdetect.ngram.NgramExtractors
import com.optimaize.langdetect.profiles.LanguageProfileReader
import com.optimaize.langdetect.text.CommonTextObjectFactories
import com.typesafe.scalalogging.LazyLogging
import fi.seco.lexical.{CompoundLexicalAnalysisService, LanguageRecognizer, SnowballLexicalAnalysisService}
import fi.seco.lexical.combined.CombinedLexicalAnalysisService
import fi.seco.lexical.hfst.HFSTLexicalAnalysisService
import fi.seco.lexical.hfst.HFSTLexicalAnalysisService.WordToResults
import play.api.libs.json.{JsValue, Json, Writes}

import scala.collection.convert.WrapAsJava._
import scala.collection.convert.WrapAsScala._
import scala.io.{Source, StdIn}
import scala.util.Try

object LASCommandLineTool {

  lazy val hfstlas = new HFSTLexicalAnalysisService
  lazy val combinedlas = new CombinedLexicalAnalysisService
  lazy val snowballlas = new SnowballLexicalAnalysisService
  lazy val compoundlas = new CompoundLexicalAnalysisService(combinedlas, snowballlas)
@@ -45,7 +38,7 @@ object LASCommandLineTool {

  object Action extends Enumeration {
    type Action = Value
    val Inflect, Lemmatize, Analyze, Detect, Recognize = Value
    val Inflect, Lemmatize, Analyze, Detect, Recognize, Hyphenate = Value
  }

  implicit val actionRead: scopt.Read[Action.Value] = scopt.Read.reads(Action withName _)
@@ -54,7 +47,7 @@

  def main(args: Array[String]) = {
    val parser = new scopt.OptionParser[Config]("las") {
      head("las", "1.5.9")
      head("las", "1.5.13")
      cmd("lemmatize") action { (_, c) =>
        c.copy(action = Action.Lemmatize)
      } text (s"(locales: ${compoundlas.getSupportedBaseformLocales.mkString(", ")})")
@@ -66,10 +59,13 @@
      } text (s"(locales: ${combinedlas.getSupportedInflectionLocales.mkString(", ")})")
      cmd("recognize") action { (_, c) =>
        c.copy(action = Action.Recognize)
      } text (s"report word recognition rate (locales: ${combinedlas.getSupportedAnalyzeLocales.mkString(", ")}")
      } text (s"report word recognition rate (locales: ${combinedlas.getSupportedAnalyzeLocales.mkString(", ")})")
      cmd("identify") action { (_, c) =>
        c.copy(action = Action.Detect)
      } text (s"identify language (locales: ${(LanguageRecognizer.getAvailableLanguages ++ LanguageDetector.supportedLanguages ++ compoundlas.getSupportedBaseformLocales).toSet.mkString(", ")})")
      cmd("hyphenate") action { (_, c) =>
        c.copy(action = Action.Hyphenate)
      } text (s"hyphenate (locales: ${combinedlas.getSupportedHyphenationLocales.mkString(", ")})")
      opt[Seq[String]]("locale") optional () action { (x, c) =>
        c.copy(locale = x)
      } text ("possible locales")
@@ -95,18 +91,43 @@
        c.copy(maxEditDistance = x)
      } text ("Maximum edit distance for error-correcting unidentified words (default 0)?")
      opt[Unit]("no-pretty") action { (_, c) =>
        c.copy(pretty = true)
        c.copy(pretty = false)
      } text ("Don't pretty print json?")
      arg[String]("<file>...") unbounded () optional () action { (x, c) =>
        c.copy(files = c.files :+ x)
      } text ("files to process (stdin if not given)")
      help("help") text("prints this usage text")
      checkConfig { c => if (c.action==null) failure("specify at least an action (lemmatize, analyze, inflect or identify)") else success }
      checkConfig { c => if (c.action==null) failure("specify at least an action (lemmatize, analyze, inflect, recognize, identify or hyphenate)") else success }
    }
    // parser.parse returns Option[C]
    parser.parse(args, Config()) match {
      case Some(config) =>
        config.action match {
          case Action.Hyphenate => if (!config.files.isEmpty) for (file <- config.files) {
            val writer = new PrintWriter(new File(file+".hyphenated"))
            val paragraphs = config.processBy match {
              case ProcessBy.File => Seq(Source.fromFile(file).mkString)
              case ProcessBy.Paragraph => Source.fromFile(file).mkString.split("\\s*\n\\s*\n").toSeq
              case ProcessBy.Line => Source.fromFile(file).mkString.split("\n").toSeq
            }
            var i = 0
            for (paragraph <- paragraphs) {
              val hyphenated = hyphenate(paragraph, config.locale).getOrElse(paragraph)
              writer.write(hyphenated)
              i += 1
              if (i!=paragraphs.length) {
                writer.write("\n")
                if (config.processBy == ProcessBy.Paragraph) writer.write("\n")
              }
            }
            writer.close()
          } else {
            var text = StdIn.readLine()
            while (text != null) {
              println(hyphenate(text, config.locale).getOrElse(text));
              text = StdIn.readLine()
            }
          }
          case Action.Lemmatize => if (!config.files.isEmpty) for (file <- config.files) {
            val writer = new PrintWriter(new File(file+".lemmatized"))
            val paragraphs = config.processBy match {
@@ -122,7 +143,8 @@
              if (i!=paragraphs.length) {
                writer.write("\n")
                if (config.processBy == ProcessBy.Paragraph) writer.write("\n")
              } }
              }
            }
            writer.close()
          } else {
            var text = StdIn.readLine()
@@ -251,8 +273,19 @@ object LASCommandLineTool {
      case None => None
    }
  }

  implicit val WordPartWrites = new Writes[HFSTLexicalAnalysisService.Result.WordPart] {

  def hyphenate(text: String, locales: Seq[String]): Option[String] = {
    (if (locales.length==1) Some(locales(0)) else getBestLang(text, if (locales.isEmpty) compoundlas.getSupportedBaseformLocales.toSeq.map(_.toString) else locales)) match {
      case Some(lang) =>
        val hyphenated = compoundlas.hyphenate(text, new Locale(lang))
        if (locales.isEmpty) Some(Json.toJson(Map("locale" -> lang, "hyphenated" -> hyphenated)).toString())
        else Some(hyphenated)
      case None => None
    }
  }


  implicit val WordPartWrites = new Writes[HFSTLexicalAnalysisService.Result.WordPart] {
    def writes(r : HFSTLexicalAnalysisService.Result.WordPart) : JsValue = {
      Json.obj(
        "lemma" -> r.getLemma,
@@ -322,16 +355,16 @@
if (locales.isEmpty) {
val lrResult = Option(LanguageRecognizer.getLanguageAsObject(text)).map(r => Map(r.getLang() -> r.getIndex))
val ldResult = Try(LanguageDetector(text).map(l => Map(l.getLocale.toString -> l.getProbability))).getOrElse(Seq.empty)
val hfstResultTmp = hfstlas.getSupportedAnalyzeLocales.map(lang =>
(lang.toString(),hfstlas.recognize(text, lang))).filter(_._2.getRate!=0.0).toSeq.sortBy(_._2.getRate).reverse.map(p => (p._1,p._2.getRate*p._2.getRate))
val hfstResultTmp = combinedlas.getSupportedAnalyzeLocales.map(lang =>
(lang.toString(),combinedlas.recognize(text, lang))).filter(_._2.getRate!=0.0).toSeq.sortBy(_._2.getRate).reverse.map(p => (p._1,p._2.getRate*p._2.getRate))
val tc = hfstResultTmp.foldRight(0.0) { _._2 + _ }
val hfstResult = hfstResultTmp.map(p => Map(p._1 -> p._2 / tc))
Try(Some((ldResult ++ hfstResult ++ lrResult).groupBy(_.keysIterator.next).mapValues(_.foldRight(0.0) { (p, r) => r + p.valuesIterator.next } / 3.0).maxBy(_._2)._1)).getOrElse(None)
} else {
val lrResult = Option(LanguageRecognizer.getLanguageAsObject(text, locales: _*)).map(r => Map(r.getLang() -> r.getIndex))
val ldResult = Try(LanguageDetector(text).filter(d => locales.contains(d.getLocale.toString)).map(l => Map(l.getLocale.toString -> l.getProbability))).getOrElse(Seq.empty)
val hfstResultTmp = locales.map(new Locale(_)).intersect(hfstlas.getSupportedAnalyzeLocales.toSeq).map(lang =>
(lang.toString(),hfstlas.recognize(text, lang))).filter(_._2.getRate!=0.0).toSeq.sortBy(_._2.getRate).reverse.map(p => (p._1,p._2.getRate*p._2.getRate))
val hfstResultTmp = locales.map(new Locale(_)).intersect(combinedlas.getSupportedAnalyzeLocales.toSeq).map(lang =>
(lang.toString(),combinedlas.recognize(text, lang))).filter(_._2.getRate!=0.0).toSeq.sortBy(_._2.getRate).reverse.map(p => (p._1,p._2.getRate*p._2.getRate))
val tc = hfstResultTmp.foldRight(0.0) { _._2 + _ }
val hfstResult = hfstResultTmp.map(p => Map(p._1 -> p._2 / tc))
Try(Some((ldResult ++ hfstResult ++ lrResult).groupBy(_.keysIterator.next).mapValues(_.foldRight(0.0) { (p, r) => r + p.valuesIterator.next } / 3.0).maxBy(_._2)._1)).getOrElse(None)
@@ -342,8 +375,8 @@
val ret = if (!locales.isEmpty) {
val lrResult = Option(LanguageRecognizer.getLanguageAsObject(text, locales: _*)).map(r => Map(r.getLang() -> r.getIndex))
val ldResult = Try(LanguageDetector(text).filter(d => locales.contains(d.getLocale.toString)).map(l => Map(l.getLocale.toString -> l.getProbability))).getOrElse(Seq.empty)
val hfstResultTmp = locales.map(new Locale(_)).intersect(hfstlas.getSupportedAnalyzeLocales.toSeq).map(lang =>
(lang.toString(),hfstlas.recognize(text, lang))).filter(_._2.getRate!=0.0).toSeq.sortBy(_._2.getRate).reverse.map(p => (p._1,p._2.getRate*p._2.getRate))
val hfstResultTmp = locales.map(new Locale(_)).intersect(combinedlas.getSupportedAnalyzeLocales.toSeq).map(lang =>
(lang.toString(),combinedlas.recognize(text, lang))).filter(_._2.getRate!=0.0).toSeq.sortBy(_._2.getRate).reverse.map(p => (p._1,p._2.getRate*p._2.getRate))
val tc = hfstResultTmp.foldRight(0.0) { _._2 + _ }
val hfstResult = hfstResultTmp.map(p => Map(p._1 -> p._2 / tc))
val bestGuess = Try(Some((ldResult ++ hfstResult ++ lrResult).groupBy(_.keysIterator.next).mapValues(_.foldRight(0.0) { (p, r) => r + p.valuesIterator.next } / 3.0).maxBy(_._2))).getOrElse(None)
@@ -354,8 +387,8 @@
} else {
val lrResult = Option(LanguageRecognizer.getLanguageAsObject(text)).map(r => Map(r.getLang() -> r.getIndex))
val ldResult = Try(LanguageDetector(text).map(l => Map(l.getLocale.toString -> l.getProbability))).getOrElse(Seq.empty)
val hfstResultTmp = hfstlas.getSupportedAnalyzeLocales.map(lang =>
(lang.toString(),hfstlas.recognize(text, lang))).filter(_._2.getRate!=0.0).toSeq.sortBy(_._2.getRate).reverse.map(p => (p._1,p._2.getRate*p._2.getRate))
val hfstResultTmp = combinedlas.getSupportedAnalyzeLocales.map(lang =>
(lang.toString(),combinedlas.recognize(text, lang))).filter(_._2.getRate!=0.0).toSeq.sortBy(_._2.getRate).reverse.map(p => (p._1,p._2.getRate*p._2.getRate))
val tc = hfstResultTmp.foldRight(0.0) { _._2 + _ }
val hfstResult = hfstResultTmp.map(p => Map(p._1 -> p._2 / tc))
val bestGuess = Try(Some((ldResult ++ hfstResult ++ lrResult).groupBy(_.keysIterator.next).mapValues(_.foldRight(0.0) { (p, r) => r + p.valuesIterator.next } / 3.0).maxBy(_._2))).getOrElse(None)

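Beyond the build split, the main functional addition is the new hyphenate command, wired through the hyphenate(text, locales) helper above: it picks a locale (using getBestLang when one is not given) and delegates to CompoundLexicalAnalysisService.hyphenate. Below is a minimal, self-contained sketch of calling that service directly; the object name, input text, and locale are illustrative only and not part of this commit.

import java.util.Locale

import fi.seco.lexical.CompoundLexicalAnalysisService
import fi.seco.lexical.SnowballLexicalAnalysisService
import fi.seco.lexical.combined.CombinedLexicalAnalysisService

object HyphenateSketch {
  def main(args: Array[String]): Unit = {
    // Same service composition as in LASCommandLineTool.
    val combined = new CombinedLexicalAnalysisService
    val snowball = new SnowballLexicalAnalysisService
    val compound = new CompoundLexicalAnalysisService(combined, snowball)
    // Illustrative Finnish input; any supported hyphenation locale should work.
    println(compound.hyphenate("ihmisoikeusjulistus", new Locale("fi")))
  }
}

Judging by the scopt definitions above, the command-line equivalent would be something like echo ihmisoikeusjulistus | las hyphenate --locale fi, while file arguments produce <file>.hyphenated outputs.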