diff --git a/.idea/checkstyle-idea.xml b/.idea/checkstyle-idea.xml new file mode 100644 index 000000000..760d82996 --- /dev/null +++ b/.idea/checkstyle-idea.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="CheckStyle-IDEA" serialisationVersion="2"> + <checkstyleVersion>10.9.3</checkstyleVersion> + <scanScope>JavaOnly</scanScope> + <copyLibs>true</copyLibs> + <option name="thirdPartyClasspath" /> + <option name="activeLocationIds" /> + <option name="locations"> + <list> + <ConfigurationLocation id="bundled-sun-checks" type="BUNDLED" scope="All" description="Sun Checks">(bundled)</ConfigurationLocation> + <ConfigurationLocation id="bundled-google-checks" type="BUNDLED" scope="All" description="Google Checks">(bundled)</ConfigurationLocation> + </list> + </option> + </component> +</project> \ No newline at end of file diff --git a/.idea/compiler.xml b/.idea/compiler.xml new file mode 100644 index 000000000..1339a2826 --- /dev/null +++ b/.idea/compiler.xml @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="CompilerConfiguration"> + <annotationProcessing> + <profile name="Maven default annotation processors profile" enabled="true"> + <sourceOutputDir name="target/generated-sources/annotations" /> + <sourceTestOutputDir name="target/generated-test-sources/test-annotations" /> + <outputRelativeToContentRoot value="true" /> + <module name="zemberek-classification" /> + <module name="zemberek-core" /> + <module name="zemberek-lang-id" /> + <module name="zemberek-tokenization" /> + <module name="zemberek-ner" /> + <module name="zemberek-morphology" /> + <module name="zemberek-grpc-server" /> + <module name="zemberek-all" /> + <module name="zemberek-normalization" /> + <module name="zemberek-lm" /> + <module name="zemberek-examples" /> + <module name="zemberek-apps" /> + <module name="zemberek-experiment" /> + </profile> + </annotationProcessing> + </component> +</project> \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 000000000..c3e3257d3 --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,33 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="Encoding"> + <file url="file://$PROJECT_DIR$/all/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/all/src/main/resources" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/apps/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/apps/src/main/resources" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/classification/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/classification/src/main/resources" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/core/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/core/src/main/resources" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/examples/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/examples/src/main/resources" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/experiment/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/experiment/src/main/resources" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/grpc/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/grpc/src/main/resources" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/lang-id/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/lang-id/src/main/resources" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/lm/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/lm/src/main/resources" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/morphology/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/morphology/src/main/resources" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/ner/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/ner/src/main/resources" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/normalization/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/normalization/src/main/resources" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/tokenization/src/main/java" charset="UTF-8" /> + <file url="file://$PROJECT_DIR$/tokenization/src/main/resources" charset="UTF-8" /> + </component> +</project> \ No newline at end of file diff --git a/.idea/jarRepositories.xml b/.idea/jarRepositories.xml new file mode 100644 index 000000000..c3ddb503b --- /dev/null +++ b/.idea/jarRepositories.xml @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="RemoteRepositoriesConfiguration"> + <remote-repository> + <option name="id" value="central" /> + <option name="name" value="Central Repository" /> + <option name="url" value="https://repo.maven.apache.org/maven2" /> + </remote-repository> + <remote-repository> + <option name="id" value="maven2-repository.dev.java.net" /> + <option name="name" value="Java.net Repository for Maven" /> + <option name="url" value="http://download.java.net/maven/2/" /> + </remote-repository> + <remote-repository> + <option name="id" value="central" /> + <option name="name" value="Maven Central repository" /> + <option name="url" value="https://repo1.maven.org/maven2" /> + </remote-repository> + <remote-repository> + <option name="id" value="jboss.community" /> + <option name="name" value="JBoss Community repository" /> + <option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" /> + </remote-repository> + </component> +</project> \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 000000000..9b865bc27 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,13 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="ExternalStorageConfigurationManager" enabled="true" /> + <component name="MavenProjectsManager"> + <option name="originalFiles"> + <list> + <option value="$PROJECT_DIR$/pom.xml" /> + <option value="$PROJECT_DIR$/apps/pom.xml" /> + </list> + </option> + </component> + <component name="ProjectRootManager" version="2" languageLevel="JDK_19" default="true" project-jdk-name="19" project-jdk-type="JavaSDK" /> +</project> \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 000000000..94a25f7f4 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="VcsDirectoryMappings"> + <mapping directory="$PROJECT_DIR$" vcs="Git" /> + </component> +</project> \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 000000000..f182fe2e6 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,70 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="AutoImportSettings"> + <option name="autoReloadType" value="SELECTIVE" /> + </component> + <component name="ChangeListManager"> + <list default="true" id="b88d3d1a-1464-4322-aa80-02a04e022dec" name="Changes" comment="Update README.md"> + <changelist_data name="Ahmet A. Akın" email="ahmetaa@gmail.com" /> + </list> + <option name="SHOW_DIALOG" value="false" /> + <option name="HIGHLIGHT_CONFLICTS" value="true" /> + <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> + <option name="LAST_RESOLUTION" value="IGNORE" /> + </component> + <component name="Git.Settings"> + <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" /> + </component> + <component name="MarkdownSettingsMigration"> + <option name="stateVersion" value="1" /> + </component> + <component name="MavenImportPreferences"> + <option name="generalSettings"> + <MavenGeneralSettings> + <option name="useMavenConfig" value="true" /> + </MavenGeneralSettings> + </option> + </component> + <component name="ProjectId" id="2O3avZ8RNqunFIPlnlOOevlaLBk" /> + <component name="ProjectViewState"> + <option name="hideEmptyMiddlePackages" value="true" /> + <option name="showLibraryContents" value="true" /> + </component> + <component name="PropertiesComponent"><![CDATA[{ + "keyToString": { + "RunOnceActivity.OpenProjectViewOnStart": "true", + "RunOnceActivity.ShowReadmeOnStart": "true", + "WebServerToolWindowFactoryState": "false", + "nodejs_package_manager_path": "npm" + } +}]]></component> + <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" /> + <component name="TaskManager"> + <task active="true" id="Default" summary="Default task"> + <changelist id="b88d3d1a-1464-4322-aa80-02a04e022dec" name="Changes" comment="" /> + <created>1680791954209</created> + <option name="number" value="Default" /> + <option name="presentableId" value="Default" /> + <updated>1680791954209</updated> + <workItem from="1680791961337" duration="1290000" /> + </task> + <servers /> + </component> + <component name="TypeScriptGeneratedFilesManager"> + <option name="version" value="3" /> + </component> + <component name="Vcs.Log.Tabs.Properties"> + <option name="TAB_STATES"> + <map> + <entry key="MAIN"> + <value> + <State /> + </value> + </entry> + </map> + </option> + </component> + <component name="VcsManagerConfiguration"> + <option name="LAST_COMMIT_MESSAGE" value="" /> + </component> +</project> \ No newline at end of file diff --git a/apps/src/main/java/zemberek/apps/ApplicationRunner.java b/apps/src/main/java/zemberek/apps/ApplicationRunner.java index e637f19b2..59cc10ea9 100644 --- a/apps/src/main/java/zemberek/apps/ApplicationRunner.java +++ b/apps/src/main/java/zemberek/apps/ApplicationRunner.java @@ -39,7 +39,8 @@ private static void listApplications(List<ConsoleApp> apps) { String simpleName = app.getClass().getSimpleName(); System.out.println(simpleName); System.out.println(Strings.repeat("-", simpleName.length())); - String wrapped = wrap(app.description(), 80); + int wrappedDescriptionLengthLimit = 80; + String wrapped = wrap(app.description(), wrappedDescriptionLengthLimit); System.out.println(wrapped); System.out.println(); } diff --git a/apps/src/main/java/zemberek/apps/fasttext/ClassificationConsole.java b/apps/src/main/java/zemberek/apps/fasttext/ClassificationConsole.java index 2cb356b0f..ca73db011 100644 --- a/apps/src/main/java/zemberek/apps/fasttext/ClassificationConsole.java +++ b/apps/src/main/java/zemberek/apps/fasttext/ClassificationConsole.java @@ -119,15 +119,9 @@ private String replaceWordsWithLemma(String sentence) { private String removeNonWords(String sentence) { List<Token> docTokens = TurkishTokenizer.DEFAULT.tokenize(sentence); List<String> reduced = new ArrayList<>(docTokens.size()); + for (Token token : docTokens) { - if ( - token.getType() == Type.PercentNumeral || - token.getType() == Type.Number || - token.getType() == Type.Punctuation || - token.getType() == Type.RomanNumeral || - token.getType() == Type.Time || - token.getType() == Type.UnknownWord || - token.getType() == Type.Unknown) { + if (isTokenNonWord(token)) { if (!token.getText().contains("__")) { continue; } @@ -138,6 +132,16 @@ private String removeNonWords(String sentence) { return String.join(" ", reduced); } + private Boolean isTokenNonWord(Token token) { + return token.getType() == Type.PercentNumeral || + token.getType() == Type.Number || + token.getType() == Type.Punctuation || + token.getType() == Type.RomanNumeral || + token.getType() == Type.Time || + token.getType() == Type.UnknownWord || + token.getType() == Type.Unknown; + } + public static void main(String[] args) { new ClassificationConsole().execute(args); } diff --git a/core/target/classes/zemberek/core/syllable/accepted-syllable-prefixes b/core/target/classes/zemberek/core/syllable/accepted-syllable-prefixes new file mode 100644 index 000000000..b49b0e805 --- /dev/null +++ b/core/target/classes/zemberek/core/syllable/accepted-syllable-prefixes @@ -0,0 +1,62 @@ +bl +br +ch +cl +cr +cy +dj +dr +dz +fl +fr +gh +gl +gr +gy +hr +hy +kh +kl +kn +kr +ks +ky +ll +ly +mb +mc +mn +my +ph +pl +pn +pr +ps +pt +rh +sc +sf +sh +sk +sl +sm +sn +sp +sr +st +sv +sw +sy +şl +şn +şv +th +tr +ts +tw +ty +vl +wh +zh +zl +zw diff --git a/core/target/classes/zemberek/core/text/html-char-map-common.txt b/core/target/classes/zemberek/core/text/html-char-map-common.txt new file mode 100644 index 000000000..e26c6eb14 --- /dev/null +++ b/core/target/classes/zemberek/core/text/html-char-map-common.txt @@ -0,0 +1,168 @@ +#287:ğ +#286:Ğ +#304:İ +#305:ı +#351:ş +#350:Ş +quot:" +#34:" +amp:& +#38:& +apos:' +#39:' +lt:< +#60:< +gt:> +#62:> +nbsp: +#160: +cent:¢ +#162:¢ +pound:£ +#163:£ +acute:´ +#180:´ +cedil:¸ +#184:¸ +raquo:» +#187:» +laquo:« +#171:« +Agrave:À +#192:À +Aacute:Á +#193:Á +Acirc: +#194: +Atilde:à +#195:à +Auml:Ä +#196:Ä +Aring:Å +#197:Å +Ccedil:Ç +#199:Ç +Egrave:È +#200:È +Eacute:É +#201:É +Ecirc:Ê +#202:Ê +Euml:Ë +#203:Ë +Igrave:Ì +#204:Ì +Iacute:Í +#205:Í +Icirc:Î +#206:Î +Iuml:Ï +#207:Ï +Ntilde:Ñ +#209:Ñ +Ograve:Ò +#210:Ò +Oacute:Ó +#211:Ó +Ocirc:Ô +#212:Ô +Otilde:Õ +#213:Õ +Ouml:Ö +#214:Ö +Ugrave:Ù +#217:Ù +Uacute:Ú +#218:Ú +Ucirc:Û +#219:Û +Uuml:Ü +#220:Ü +Yacute:Ý +#221:Ý +THORN:Þ +#222:Þ +szlig:ß +#223:ß +agrave:à +#224:à +aacute:á +#225:á +acirc:â +#226:â +atilde:ã +#227:ã +auml:ä +#228:ä +aring:å +#229:å +aelig:æ +#230:æ +ccedil:ç +#231:ç +egrave:è +#232:è +eacute:é +#233:é +ecirc:ê +#234:ê +euml:ë +#235:ë +igrave:ì +#236:ì +iacute:í +#237:í +icirc:î +#238:î +iuml:ï +#239:ï +eth:ð +#240:ð +ntilde:ñ +#241:ñ +ograve:ò +#242:ò +oacute:ó +#243:ó +ocirc:ô +#244:ô +otilde:õ +#245:õ +ouml:ö +#246:ö +ugrave:ù +#249:ù +uacute:ú +#250:ú +ucirc:û +#251:û +uuml:ü +#252:ü +lsquo:‘ +#8216:‘ +rsquo:’ +#8217:’ +sbquo:‚ +#8218:‚ +ldquo:“ +#8220:“ +rdquo:” +#8221:” +bdquo:„ +#8222:„ +hellip:… +#8230:… +prime:′ +#8242:′ +Prime:″ +#8243:″ +lsaquo:‹ +#8249:‹ +rsaquo:› +#8250:› +oline:‾ +#8254:‾ +frasl:⁄ +#8260:⁄ +euro:€ +#8364:€ \ No newline at end of file diff --git a/core/target/classes/zemberek/core/text/html-char-map-full.txt b/core/target/classes/zemberek/core/text/html-char-map-full.txt new file mode 100644 index 000000000..1ebbcbe0f --- /dev/null +++ b/core/target/classes/zemberek/core/text/html-char-map-full.txt @@ -0,0 +1,517 @@ +#287:ğ +#286:Ğ +#304:İ +#305:ı +#351:ş +#350:Ş +#145:' +#146:' +#147:" +#148:" +#151:- +am:& +#38:& +apos:' +#39:' +quot:" +#34:" +lt:< +#60:< +gt:> +#62:> +nbsp: +#160: +iexcl:¡ +#161:¡ +cent:¢ +#162:¢ +pound:£ +#163:£ +curren:¤ +#164:¤ +yen:¥ +#165:¥ +brvbar:¦ +#166:¦ +sect:§ +#167:§ +uml:¨ +#168:¨ +copy:© +#169:© +ordf:ª +#170:ª +laquo:« +#171:« +not:¬ +#172:¬ +shy: +#173: +reg:® +#174:® +macr:¯ +#175:¯ +deg:° +#176:° +plusmn:± +#177:± +sup2:² +#178:² +sup3:³ +#179:³ +acute:´ +#180:´ +micro:µ +#181:µ +para:¶ +#182:¶ +middot:· +#183:· +cedil:¸ +#184:¸ +sup1:¹ +#185:¹ +ordm:º +#186:º +raquo:» +#187:» +frac14:¼ +#188:¼ +frac12:½ +#189:½ +frac34:¾ +#190:¾ +iquest:¿ +#191:¿ +Agrave:À +#192:À +Aacute:Á +#193:Á +Acirc: +#194: +Atilde:à +#195:à +Auml:Ä +#196:Ä +Aring:Å +#197:Å +AElig:Æ +#198:Æ +Ccedil:Ç +#199:Ç +Egrave:È +#200:È +Eacute:É +#201:É +Ecirc:Ê +#202:Ê +Euml:Ë +#203:Ë +Igrave:Ì +#204:Ì +Iacute:Í +#205:Í +Icirc:Î +#206:Î +Iuml:Ï +#207:Ï +ETH:Ð +#208:Ð +Ntilde:Ñ +#209:Ñ +Ograve:Ò +#210:Ò +Oacute:Ó +#211:Ó +Ocirc:Ô +#212:Ô +Otilde:Õ +#213:Õ +Ouml:Ö +#214:Ö +times:× +#215:× +Oslash:Ø +#216:Ø +Ugrave:Ù +#217:Ù +Uacute:Ú +#218:Ú +Ucirc:Û +#219:Û +Uuml:Ü +#220:Ü +Yacute:Ý +#221:Ý +THORN:Þ +#222:Þ +szlig:ß +#223:ß +agrave:à +#224:à +aacute:á +#225:á +acirc:â +#226:â +atilde:ã +#227:ã +auml:ä +#228:ä +aring:å +#229:å +aelig:æ +#230:æ +ccedil:ç +#231:ç +egrave:è +#232:è +eacute:é +#233:é +ecirc:ê +#234:ê +euml:ë +#235:ë +igrave:ì +#236:ì +iacute:í +#237:í +icirc:î +#238:î +iuml:ï +#239:ï +eth:ð +#240:ð +ntilde:ñ +#241:ñ +ograve:ò +#242:ò +oacute:ó +#243:ó +ocirc:ô +#244:ô +otilde:õ +#245:õ +ouml:ö +#246:ö +divide:÷ +#247:÷ +oslash:ø +#248:ø +ugrave:ù +#249:ù +uacute:ú +#250:ú +ucirc:û +#251:û +uuml:ü +#252:ü +yacute:ý +#253:ý +thorn:þ +#254:þ +yuml:ÿ +#255:ÿ +OElig:Œ +#338:Œ +oelig:œ +#339:œ +Scaron:Š +#352:Š +scaron:š +#353:š +Yuml:Ÿ +#376:Ÿ +fnof:ƒ +#402:ƒ +circ:ˆ +#710:ˆ +tilde:˜ +#732:˜ +Alpha:Α +#913:Α +Beta:Β +#914:Β +Gamma:Γ +#915:Γ +Delta:Δ +#916:Δ +Epsilon:Ε +#917:Ε +Zeta:Ζ +#918:Ζ +Eta:Η +#919:Η +Theta:Θ +#920:Θ +Iota:Ι +#921:Ι +Kappa:Κ +#922:Κ +Lambda:Λ +#923:Λ +Mu:Μ +#924:Μ +Nu:Ν +#925:Ν +Xi:Ξ +#926:Ξ +Omicron:Ο +#927:Ο +Pi:Π +#928:Π +Rho:Ρ +#929:Ρ +Sigma:Σ +#931:Σ +Tau:Τ +#932:Τ +Upsilon:Υ +#933:Υ +Phi:Φ +#934:Φ +Chi:Χ +#935:Χ +Psi:Ψ +#936:Ψ +Omega:Ω +#937:Ω +alpha:α +#945:α +beta:β +#946:β +gamma:γ +#947:γ +delta:δ +#948:δ +epsilon:ε +#949:ε +zeta:ζ +#950:ζ +eta:η +#951:η +theta:θ +#952:θ +iota:ι +#953:ι +kappa:κ +#954:κ +lambda:λ +#955:λ +mu:μ +#956:μ +nu:ν +#957:ν +xi:ξ +#958:ξ +omicron:ο +#959:ο +pi:π +#960:π +rho:ρ +#961:ρ +sigmaf:ς +#962:ς +sigma:σ +#963:σ +tau:τ +#964:τ +upsilon:υ +#965:υ +phi:φ +#966:φ +chi:χ +#967:χ +psi:ψ +#968:ψ +omega:ω +#969:ω +thetasym:ϑ +#977:ϑ +upsih:ϒ +#978:ϒ +piv:ϖ +#982:ϖ +ensp: +#8194: +emsp: +#8195: +thinsp: +#8201: +#zwnj: +#8204: +#zwj: +#8205: +#lrm: +#8206: +#rlm: +#8207: +ndash:– +#8211:– +mdash:— +#8212:— +lsquo:‘ +#8216:‘ +rsquo:’ +#8217:’ +sbquo:‚ +#8218:‚ +ldquo:“ +#8220:“ +rdquo:” +#8221:” +bdquo:„ +#8222:„ +dagger:† +#8224:† +Dagger:‡ +#8225:‡ +bull:• +#8226:• +hellip:… +#8230:… +permil:‰ +#8240:‰ +prime:′ +#8242:′ +Prime:″ +#8243:″ +lsaquo:‹ +#8249:‹ +rsaquo:› +#8250:› +oline:‾ +#8254:‾ +frasl:⁄ +#8260:⁄ +euro:€ +#8364:€ +image:ℑ +#8465:ℑ +weierp:℘ +#8472:℘ +real:ℜ +#8476:ℜ +trade:™ +#8482:™ +alefsym:ℵ +#8501:ℵ +larr:← +#8592:← +uarr:↑ +#8593:↑ +rarr:→ +#8594:→ +darr:↓ +#8595:↓ +harr:↔ +#8596:↔ +crarr:↵ +#8629:↵ +lArr:⇐ +#8656:⇐ +uArr:⇑ +#8657:⇑ +rArr:⇒ +#8658:⇒ +dArr:⇓ +#8659:⇓ +hArr:⇔ +#8660:⇔ +forall:∀ +#8704:∀ +part:∂ +#8706:∂ +exist:∃ +#8707:∃ +empty:∅ +#8709:∅ +nabla:∇ +#8711:∇ +isin:∈ +#8712:∈ +notin:∉ +#8713:∉ +ni:∋ +#8715:∋ +prod:∏ +#8719:∏ +sum:∑ +#8721:∑ +minus:− +#8722:− +lowast:∗ +#8727:∗ +radic:√ +#8730:√ +prop:∝ +#8733:∝ +infin:∞ +#8734:∞ +ang:∠ +#8736:∠ +and:∧ +#8743:∧ +or:∨ +#8744:∨ +cap:∩ +#8745:∩ +cup:∪ +#8746:∪ +int:∫ +#8747:∫ +there4:∴ +#8756:∴ +sim:∼ +#8764:∼ +cong:≅ +#8773:≅ +asymp:≈ +#8776:≈ +ne:≠ +#8800:≠ +equiv:≡ +#8801:≡ +le:≤ +#8804:≤ +ge:≥ +#8805:≥ +sub:⊂ +#8834:⊂ +sup:⊃ +#8835:⊃ +nsub:⊄ +#8836:⊄ +sube:⊆ +#8838:⊆ +supe:⊇ +#8839:⊇ +oplus:⊕ +#8853:⊕ +otimes:⊗ +#8855:⊗ +perp:⊥ +#8869:⊥ +sdot:⋅ +#8901:⋅ +lceil:⌈ +#8968:⌈ +rceil:⌉ +#8969:⌉ +lfloor:⌊ +#8970:⌊ +rfloor:⌋ +#8971:⌋ +lang:〈 +#9001:〈 +rang:〉 +#9002:〉 +loz:◊ +#9674:◊ +spades:♠ +#9824:♠ +clubs:♣ +#9827:♣ +hearts:♥ +#9829:♥ +diams:♦ +#9830:♦ \ No newline at end of file diff --git a/core/target/classes/zemberek/core/text/special-char-to-simple-char.txt b/core/target/classes/zemberek/core/text/special-char-to-simple-char.txt new file mode 100644 index 000000000..7d3c61fb9 --- /dev/null +++ b/core/target/classes/zemberek/core/text/special-char-to-simple-char.txt @@ -0,0 +1,54 @@ +À:A +Á:A +Â:A +Ã:A +Ä:A +Å:A +È:E +É:E +Ê:E +Ë:E +Ì:I +Í:I +Î:İ +Ï:I +Ñ:N +Ò:O +Ó:O +Ô:O +Õ:O +Ù:U +Ú:U +Û:U +à:a +á:a +â:a +ã:a +ä:a +å:a +è:e +é:e +ê:e +ë:e +ì:i +í:i +î:i +ï:i +ñ:n +ò:o +ó:o +ô:o +õ:o +ù:u +ú:u +û:u +‘:' +’:' +“:" +”:" +…:... +′:' +″:" +´:' +»:" +«:" \ No newline at end of file diff --git a/normalization/src/main/java/zemberek/normalization/SingleWordSpellChecker.java b/normalization/src/main/java/zemberek/normalization/SingleWordSpellChecker.java index 6c9613f48..28a1692c9 100644 --- a/normalization/src/main/java/zemberek/normalization/SingleWordSpellChecker.java +++ b/normalization/src/main/java/zemberek/normalization/SingleWordSpellChecker.java @@ -524,10 +524,10 @@ public boolean equals(Object o) { @Override public int hashCode() { int result; - long temp; + long penaltyAsLongVariable; result = node.hashCode(); - temp = Double.doubleToLongBits(penalty); - result = 31 * result + (int) (temp ^ (temp >>> 32)); + penaltyAsLongVariable = Double.doubleToLongBits(penalty); + result = 31 * result + (int) (penaltyAsLongVariable ^ (penaltyAsLongVariable >>> 32)); result = 31 * result + index; return result; }