diff --git a/govarnam-rust/example/src/main.rs b/govarnam-rust/example/src/main.rs index ec805c5..eb51ce6 100644 --- a/govarnam-rust/example/src/main.rs +++ b/govarnam-rust/example/src/main.rs @@ -19,22 +19,24 @@ static VARNAM: Lazy = Lazy::new(|| { }); fn main() { - let mut matches: Vec<(String, String)> = Vec::with_capacity(20); - - let results = VARNAM.transliterate("namaskkaaram"); + // for _ in 0..50 { + let mut matches: Vec<(String, String)> = Vec::with_capacity(20); + + let results = VARNAM.transliterate("namaskkaaram"); + + // for item in results { + // println!( + // "Word: {}, Weight: {}, Learned on: {}", + // item.to_string(), + // item.weight, + // item.learned_on, + // ); + // } + + for result in results { + matches.push(("input".into(), result.to_string())) + } - // for item in results { - // println!( - // "Word: {}, Weight: {}, Learned on: {}", - // item.to_string(), - // item.weight, - // item.learned_on, - // ); + eprintln!("{:?}", matches); // } - - for result in results { - matches.push(("input".into(), result.to_string())) - } - - eprintln!("{:?}", matches); } \ No newline at end of file diff --git a/govarnam-rust/src/rvarnam.rs b/govarnam-rust/src/rvarnam.rs index 3fba1bb..7d9e2a2 100644 --- a/govarnam-rust/src/rvarnam.rs +++ b/govarnam-rust/src/rvarnam.rs @@ -47,26 +47,34 @@ impl Varnam { let vst_file = vst_file.as_ref().to_string_lossy().to_string(); let learning_file = learning_file.as_ref().to_string_lossy().to_string(); unsafe { - let _init_id = varnam_init( + let init_id = varnam_init( vst_file.as_ptr() as *const i8, learning_file.as_ptr() as *const i8, &id, ); + + while init_id != std::ptr::null() { + return Self::init(vst_file, learning_file); + } }; - // TODO: check error use init_id + Ok(Varnam { handle_id: id }) } pub fn transliterate>(&self, word: T) -> Vec { let id: c_int = 1; - let word = CString::new(word.as_ref()).unwrap(); + let c_word = CString::new(word.as_ref()).unwrap(); let mut varray_ptr = varray_t::init(); - unsafe { 
varnam_transliterate(self.handle_id, id, word.as_ptr(), &mut varray_ptr) }; + let trans_id = unsafe { varnam_transliterate(self.handle_id, id, c_word.as_ptr(), &mut varray_ptr) }; + while trans_id != std::ptr::null() { + return self.transliterate(word); + } let varray_pointer = unsafe { *varray_ptr as varray_t }; varray_pointer.into() } } + impl Drop for Varnam { fn drop(&mut self) { unsafe { varnam_close(self.handle_id) } diff --git a/govarnam/.gitignore b/govarnam/.gitignore index ac2e8fb..78ed586 100644 --- a/govarnam/.gitignore +++ b/govarnam/.gitignore @@ -6,3 +6,4 @@ install.sh govarnam.pc *.vst +a diff --git a/govarnam/Makefile b/govarnam/Makefile index 33400fb..201b3b6 100644 --- a/govarnam/Makefile +++ b/govarnam/Makefile @@ -22,15 +22,11 @@ CURDIR := $(shell pwd) ifeq ($(UNAME), Darwin) SED := sed -i "" LIB_NAME = libgovarnam.dylib -else ifeq ($(UNAME), Windows_NT) - INSTALL_DIR = "C:\\lib" - LIB_NAME = libgovarnam.dll else EXT_LDFLAGS = -extldflags "-Wl,-soname,$(LIB_NAME).$(SO_NAME),--version-script,$(CURDIR)/govarnam.syms" endif VERSION_STAMP_LDFLAGS := -X 'github.com/varnamproject/govarnam/govarnam.BuildString=${BUILDSTR}' -X 'github.com/varnamproject/govarnam/govarnam.VersionString=${VERSION}' $(EXT_LDFLAGS) - pc: cp govarnam.pc.in govarnam.pc ${SED} "s#@INSTALL_PREFIX@#${INSTALL_PREFIX}#g" govarnam.pc @@ -71,20 +67,8 @@ library-nosqlite: CGO_ENABLED=1 go build -tags "fts5,libsqlite3" -buildmode=c-shared -ldflags "-s -w ${VERSION_STAMP_LDFLAGS}" -o ${LIB_NAME} . library: - ifeq ($(UNAME), Windows_NT) - CGO_CFLAGS=-Dvarnam_EXPORTS CGO_ENABLED=1 go build -tags "fts5" -buildmode=c-shared -ldflags "-s -w ${VERSION_STAMP_LDFLAGS}" -o ${LIB_NAME} . 
- if not exist "${INSTALL_DIR}" ( - mkdir "${INSTALL_DIR}" - ) - copy ${LIB_NAME} "${INSTALL_DIR}" - echo %PATH% | findstr /C:"${INSTALL_DIR}" > nul - if errorlevel 1 ( - setx PATH "%PATH%;${INSTALL_DIR}" - ) - else - CGO_ENABLED=1 go build -tags "fts5" -buildmode=c-shared -ldflags "-s -w ${VERSION_STAMP_LDFLAGS}" -o ${LIB_NAME} . - ln -sf "$(realpath ./)/libgovarnam.so" "$(realpath ./)/libgovarnam.so.${SO_NAME}" - endif + CGO_ENABLED=1 go build -tags "fts5" -buildmode=c-shared -ldflags "-s -w ${VERSION_STAMP_LDFLAGS}" -o ${LIB_NAME} . + ln -sf "$(realpath ./)/libgovarnam.so" "$(realpath ./)/libgovarnam.so.${SO_NAME}" library-mac-universal: GOOS=darwin GOARCH=arm64 $(MAKE) library diff --git a/govarnam/c-shared.go b/govarnam/c-shared.go index 9996cbb..843fb9a 100644 --- a/govarnam/c-shared.go +++ b/govarnam/c-shared.go @@ -109,8 +109,7 @@ func makeCTransliterationResult(ctx context.Context, goResult govarnam.Translite //export varnam_get_version func varnam_get_version() *C.char { - version := govarnam.VersionString - return C.CString(version) + return C.CString(govarnam.VersionString) } //export varnam_get_build @@ -273,35 +272,30 @@ func varnam_debug(varnamHandleID C.int, val C.int) { } // Deprecated. Use varnam_config() -// //export varnam_set_indic_digits func varnam_set_indic_digits(varnamHandleID C.int, val C.int) { varnam_config(varnamHandleID, C.VARNAM_CONFIG_USE_INDIC_DIGITS, val) } // Deprecated. Use varnam_config() -// //export varnam_set_dictionary_suggestions_limit func varnam_set_dictionary_suggestions_limit(varnamHandleID C.int, val C.int) { getVarnamHandle(varnamHandleID).varnam.DictionarySuggestionsLimit = int(val) } // Deprecated. Use varnam_config() -// //export varnam_set_pattern_dictionary_suggestions_limit func varnam_set_pattern_dictionary_suggestions_limit(varnamHandleID C.int, val C.int) { getVarnamHandle(varnamHandleID).varnam.PatternDictionarySuggestionsLimit = int(val) } // Deprecated. 
Use varnam_config() -// //export varnam_set_tokenizer_suggestions_limit func varnam_set_tokenizer_suggestions_limit(varnamHandleID C.int, val C.int) { getVarnamHandle(varnamHandleID).varnam.TokenizerSuggestionsLimit = int(val) } // Deprecated. Use varnam_config() -// //export varnam_set_dictionary_match_exact func varnam_set_dictionary_match_exact(varnamHandleID C.int, val C.int) { if val == 0 { diff --git a/govarnam/govarnam/channel.go b/govarnam/govarnam/channel.go index 59b35c2..9d4e855 100644 --- a/govarnam/govarnam/channel.go +++ b/govarnam/govarnam/channel.go @@ -128,7 +128,7 @@ func (varnam *Varnam) channelGetFromDictionary(ctx context.Context, word string, if len(dictResult.partialMatches) > 0 { // Tokenize the word after the longest match found in dictionary - restOfWord := word[dictResult.longestMatchPosition+1:] + restOfWord := string([]rune(word)[dictResult.longestMatchPosition+1:]) start := time.Now() diff --git a/govarnam/govarnam/constants.go b/govarnam/govarnam/constants.go index 01b0258..5a47448 100644 --- a/govarnam/govarnam/constants.go +++ b/govarnam/govarnam/constants.go @@ -98,7 +98,7 @@ func getVSTLookupDirs() []string { } } -//FindVSTDir Get the VST storing directory +// FindVSTDir Get the VST storing directory func FindVSTDir() (string, error) { for _, loc := range getVSTLookupDirs() { if dirExists(loc) { diff --git a/govarnam/govarnam/dictionary.go b/govarnam/govarnam/dictionary.go index 16b06e7..0959a80 100644 --- a/govarnam/govarnam/dictionary.go +++ b/govarnam/govarnam/dictionary.go @@ -57,7 +57,7 @@ type searchDictionaryResult struct { func (varnam *Varnam) InitDict(dictPath string) error { var err error - if dictPath == "" || !fileExists(dictPath) { + if !fileExists(dictPath) { log.Printf("Making Varnam Learnings Dir for %s\n", dictPath) err := os.MkdirAll(path.Dir(dictPath), 0750) if err != nil { diff --git a/govarnam/govarnam/govarnam.go b/govarnam/govarnam/govarnam.go index 2928341..2d50876 100644 --- 
a/govarnam/govarnam/govarnam.go +++ b/govarnam/govarnam/govarnam.go @@ -14,6 +14,7 @@ import ( "sort" "strings" "time" + "unicode" "unicode/utf8" // sqlite3 @@ -25,6 +26,7 @@ type LangRules struct { Virama string IndicDigits bool PatternLongestLength int // Longest length of pattern in VST + UnicodeBlock unicode.RangeTable } // SchemeDetails of VST @@ -247,8 +249,8 @@ func (varnam *Varnam) setDefaultConfig() { varnam.DictionaryMatchExact = false varnam.LangRules.IndicDigits = false - varnam.LangRules.Virama, _ = varnam.getVirama() + varnam.LangRules.UnicodeBlock = varnam.getUnicodeBlock() if varnam.SchemeDetails.LangCode == "ml" { varnam.RegisterPatternWordPartializer(varnam.mlPatternWordPartializer) diff --git a/govarnam/govarnam/govarnam_ml_test.go b/govarnam/govarnam/govarnam_ml_test.go index 877de53..d6c1370 100644 --- a/govarnam/govarnam/govarnam_ml_test.go +++ b/govarnam/govarnam/govarnam_ml_test.go @@ -478,6 +478,7 @@ func TestMLRecentlyLearnedWords(t *testing.T) { } result, err = varnam.GetRecentlyLearntWords(context.Background(), 4, len(words)) + checkError(err) assertEqual(t, result[0].Word, "ആലപ്പുഴ") } @@ -495,3 +496,16 @@ func TestMLGetSuggestions(t *testing.T) { assertEqual(t, result[0].Word, "ആലപ്പുഴ") } + +func TestMLNativePartialWordsInInput(t *testing.T) { + varnam := getVarnamInstance("ml") + + words := []string{"ആലപ്പുഴ", "പുസ്തകം"} + for _, word := range words { + varnam.Learn(word, 0) + } + + assertEqual(t, varnam.TransliterateAdvanced("ആലppu").DictionarySuggestions[0].Word, "ആലപ്പുഴ") + assertEqual(t, varnam.TransliterateAdvanced("puസ്ത").DictionarySuggestions[0].Word, "പുസ്തകം") + assertEqual(t, varnam.TransliterateAdvanced("ആലippazham").DictionarySuggestions[0].Word, "ആലിപ്പഴം") +} diff --git a/govarnam/govarnam/govarnam_ml.go b/govarnam/govarnam/lang_specific_rules.go similarity index 69% rename from govarnam/govarnam/govarnam_ml.go rename to govarnam/govarnam/lang_specific_rules.go index 632a4d5..6bf0f61 100644 --- 
a/govarnam/govarnam/govarnam_ml.go +++ b/govarnam/govarnam/lang_specific_rules.go @@ -1,5 +1,7 @@ package govarnam +import "unicode" + /** * govarnam - An Indian language transliteration library * Copyright Subin Siby , 2021 @@ -23,3 +25,15 @@ func (varnam *Varnam) mlPatternWordPartializer(sug *Suggestion) { sug.Word = sug.Word[0:len(sug.Word)-size] + "മ" } } + +func (varnam *Varnam) getUnicodeBlock() unicode.RangeTable { + switch varnam.SchemeDetails.LangCode { + case "kn": + return unicode.RangeTable{R16: []unicode.Range16{{0x0C80, 0x0CFF, 1}}} + case "ml": + return unicode.RangeTable{R16: []unicode.Range16{{0x0D00, 0x0D7F, 1}}} + default: + return unicode.RangeTable{} + } + // TODO add for all languages +} diff --git a/govarnam/govarnam/symbol.go b/govarnam/govarnam/symbol.go index a02d617..3f4cd05 100644 --- a/govarnam/govarnam/symbol.go +++ b/govarnam/govarnam/symbol.go @@ -12,6 +12,7 @@ import ( "fmt" "log" "strings" + "unicode" "github.com/mattn/go-sqlite3" ) @@ -40,16 +41,18 @@ type Token struct { character string // Non language character } +var sqlite3WithLimitDriverRegistered bool var sqlite3Conn *sqlite3.SQLiteConn func openDB(path string) (*sql.DB, error) { - if sqlite3Conn == nil { + if !sqlite3WithLimitDriverRegistered { sql.Register("sqlite3_with_limit", &sqlite3.SQLiteDriver{ ConnectHook: func(conn *sqlite3.SQLiteConn) error { sqlite3Conn = conn return nil }, }) + sqlite3WithLimitDriverRegistered = true } conn, err := sql.Open("sqlite3_with_limit", path) @@ -274,10 +277,17 @@ func (varnam *Varnam) tokenizeWord(ctx context.Context, word string, matchType i matches := varnam.findLongestPatternMatchSymbols(ctx, sequence, matchType, acceptCondition) if len(matches) == 0 { - // No matches, add a character token - // Note that we just add 1 character, and move on - token := Token{VARNAM_TOKEN_CHAR, matches, i, string(sequence[:1])} - results = append(results, token) + if unicode.In(sequence[0], &varnam.LangRules.UnicodeBlock) { + // This helps to get 
suggestions in inputs like "ആലppu" + character := string(sequence[0]) + token := Token{VARNAM_TOKEN_SYMBOL, []Symbol{{Value1: character}}, i, character} + results = append(results, token) + } else { + // No matches, add a character token + // Note that we just add 1 character, and move on + token := Token{VARNAM_TOKEN_CHAR, matches, i, string(sequence[:1])} + results = append(results, token) + } i++ } else { diff --git a/govarnam/govarnamgo/govarnamgo_ml_test.go b/govarnam/govarnamgo/govarnamgo_ml_test.go index de22b04..cd8c44e 100644 --- a/govarnam/govarnamgo/govarnamgo_ml_test.go +++ b/govarnam/govarnamgo/govarnamgo_ml_test.go @@ -91,7 +91,7 @@ func TestRecentlyLearnedWords(t *testing.T) { func TestSearchSymbolTable(t *testing.T) { varnam := getVarnamInstance("ml") - symbol := varnam.NewSearchSymbol() + symbol := NewSearchSymbol() symbol.Pattern = "la" result := varnam.SearchSymbolTable(context.Background(), symbol) diff --git a/govarnam/govarnamgo/govarnamgo_test.go b/govarnam/govarnamgo/govarnamgo_test.go index edfd778..fb32587 100644 --- a/govarnam/govarnamgo/govarnamgo_test.go +++ b/govarnam/govarnamgo/govarnamgo_test.go @@ -32,7 +32,7 @@ func assertEqual(t *testing.T, a interface{}, b interface{}) { } func setUp(schemeID string) { - varnam, err := VarnamInitFromID(schemeID) + varnam, err := InitFromID(schemeID) checkError(err) mutex.Lock() @@ -67,7 +67,7 @@ func tearDown() { func TestMain(m *testing.M) { var err error - testTempDir, err = os.TempDir("", "govarnam_test") + testTempDir, err = os.MkdirTemp("", "govarnamgo_test") checkError(err) setUp("ml") diff --git a/govarnam/schemes/.DS_Store b/govarnam/schemes/.DS_Store new file mode 100644 index 0000000..f41a348 Binary files /dev/null and b/govarnam/schemes/.DS_Store differ diff --git a/govarnam/schemes/.gitattributes b/govarnam/schemes/.gitattributes new file mode 100644 index 0000000..5784f54 --- /dev/null +++ b/govarnam/schemes/.gitattributes @@ -0,0 +1,2 @@ +*.vlf filter=lfs diff=lfs merge=lfs -text 
+*.txt filter=lfs diff=lfs merge=lfs -text diff --git a/govarnam/schemes/.github/workflows/build.yml b/govarnam/schemes/.github/workflows/build.yml new file mode 100644 index 0000000..8b315cd --- /dev/null +++ b/govarnam/schemes/.github/workflows/build.yml @@ -0,0 +1,43 @@ +name: Build + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + publish: + name: Build + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + lfs: true + submodules: recursive + + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: 1.16 + + - name: Make GoVarnam + run: | + git clone https://github.com/varnamproject/govarnam.git govarnam + cd govarnam + make + sudo make install + + - name: Dependencies + run: | + sudo apt install ruby-ffi + + - name: Make Schemes + run: | + ./build_all_schemes.sh + + - name: Run Tests + run: | + ruby test/run.rb diff --git a/govarnam/schemes/.github/workflows/release.yml b/govarnam/schemes/.github/workflows/release.yml new file mode 100644 index 0000000..fb14371 --- /dev/null +++ b/govarnam/schemes/.github/workflows/release.yml @@ -0,0 +1,56 @@ +name: Publish + +on: + push: + tags: + - '*' + +jobs: + publish: + name: Publish + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + lfs: true + submodules: recursive + + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: 1.16 + + - name: Make GoVarnam + run: | + git clone https://github.com/varnamproject/govarnam.git govarnam + cd govarnam + make + sudo make install + + - name: Dependencies + run: | + sudo apt install ruby-ffi + + - name: Make Schemes + run: | + ./build_all_schemes.sh + + - name: Build Packs + run: | + sudo ./install_all_schemes.sh + ./build_all_packs.sh + + - name: Make Language Zips + run: | + ./build_zips.sh + + - name: Upload Release Binary + uses: svenstaro/upload-release-action@v2 + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: "*.zip" + 
file_glob: true + tag: ${{ github.ref }} + overwrite: true diff --git a/govarnam/schemes/.gitignore b/govarnam/schemes/.gitignore new file mode 100644 index 0000000..97fa0dd --- /dev/null +++ b/govarnam/schemes/.gitignore @@ -0,0 +1,6 @@ +libvarnam* +*.vst* +*.so +*.zip +*.vlf +install.sh diff --git a/govarnam/schemes/LICENSE b/govarnam/schemes/LICENSE new file mode 100644 index 0000000..a612ad9 --- /dev/null +++ b/govarnam/schemes/LICENSE @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. 
"Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. 
Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/govarnam/schemes/README.md b/govarnam/schemes/README.md new file mode 100644 index 0000000..9e5269e --- /dev/null +++ b/govarnam/schemes/README.md @@ -0,0 +1,87 @@ +# Language Files + +Varnam Language support files. **DOWNLOAD YOUR LANGUAGE FILES FROM [releases](https://github.com/varnamproject/schemes/releases)**. + +## Installation + +* Download your language support file from [releases](https://github.com/varnamproject/schemes/releases) +* Extract zip +* Open a terminal in your extracted folder +* Run (DO NOT RUN WITH sudo): +``` +./install.sh install +``` +It will ask for your password; enter it. You will also be asked to import words. + +To check if installation is successful, try this command: +```bash +varnamcli -s ml enthaanu +``` +It should give Malayalam output if installation is successful. + +## Development + +Folder structure: +- `schemes` + - ... + - `ml` + - `ml.scheme` - Scheme File + - `symbol-frequency-report.txt` - Symbol Frequency Report + - `Other folders` - Different pack folders + - ... +- `install.sh.in` - A placeholder script which will be copied to every scheme folder + +### Scheme + +A scheme file is a mapping of English characters to Indian language characters. This helps in transliteration using a letter-by-letter conversion. + +The scheme file is compiled to a file called Varnam Symbol Table (VST). Varnam uses VST to do transliteration. **VST IS REQUIRED** for basic language support in Varnam. + +[Read more on scheme](https://www.varnamproject.com/docs/adding-a-new-language) + +### Symbol Frequency Report + +This file is used to populate the `weight` column in the VST. + +File format: + +``` +ക 98 +വ 98 +അ 98 +... 
+``` + +This file is made using scripts inside `scripts` folder. It has a README. + +### Packs + +A language pack is a set of pre-trained **Varnam Learning Files (VLF)** that can be imported into any Varnam instance quickly. It has many words in it. It's basically a dictionary file to import words from. + +### Compiling A Scheme + +Install dependencies: + +```bash +sudo apt install ruby-ffi +``` + +Compile scheme: + +```bash +./compile-scheme.rb -s schemes/ta/ta.scheme -o schemes/ta/ta.vst +``` + +The compiled scheme will be a SQLite Database with extension ".vst". + +Now link the file to the place where Varnam will look for VST. + +```bash +sudo ln -s $(realpath schemes/ta/ta.vst) /usr/local/share/varnam/schemes/ta.vst +``` + +Now Varnam can use it. Test it out : + +```bash +varnamcli -s ta nandri +``` diff --git a/govarnam/schemes/README.txt.in b/govarnam/schemes/README.txt.in new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/README.txt.in @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/as/README.md b/govarnam/schemes/as/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/as/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. 
To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/as/import.sh b/govarnam/schemes/as/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/as/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit +fi + +schemeID=$(ls $SCRIPT_DIR/*.vst) +schemeID=${schemeID/$SCRIPT_DIR\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in $SCRIPT_DIR/*/*.vlf; do + varnamcli -s $schemeID -import $vlf +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/bn/README.md b/govarnam/schemes/bn/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/bn/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. 
To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/bn/import.sh b/govarnam/schemes/bn/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/bn/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit +fi + +schemeID=$(ls $SCRIPT_DIR/*.vst) +schemeID=${schemeID/$SCRIPT_DIR\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in $SCRIPT_DIR/*/*.vlf; do + varnamcli -s $schemeID -import $vlf +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/build_all_packs.sh b/govarnam/schemes/build_all_packs.sh new file mode 100644 index 0000000..cb2b0b2 --- /dev/null +++ b/govarnam/schemes/build_all_packs.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +schemes=("as" "bn" "gu" "hi" "kn" "ml" "ml-inscript" "mr" "ne" "or" "pa" "sa" "ta" "te") +for schemeID in ${schemes[@]}; do + for packDir in schemes/$schemeID/*/ ; do + if [ -d "$packDir" ]; then + echo "$packDir" + python3 scripts/make-pack.py $schemeID $packDir + fi + done +done diff --git a/govarnam/schemes/build_all_schemes.sh b/govarnam/schemes/build_all_schemes.sh new file mode 100644 index 0000000..2744b8d --- /dev/null +++ b/govarnam/schemes/build_all_schemes.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +schemes=("as" "bn" "gu" "hi" "kn" "ml" "ml-inscript" "mr" "ne" "or" "pa" "sa" "ta" "te") +for f in ${schemes[@]}; do + ./build_scheme.sh $f +done diff --git a/govarnam/schemes/build_scheme.sh b/govarnam/schemes/build_scheme.sh new file mode 100644 index 0000000..e64a3a7 --- /dev/null +++ b/govarnam/schemes/build_scheme.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +f=$1 +vst=schemes/$f/$f.vst +./compile-scheme.rb -s 
schemes/$f/$f.scheme -o $vst +if [ -f schemes/$f/symbol-frequency-report.txt ]; then + python3 scripts/symbol-weight-update-in-vst.py $vst schemes/$f/symbol-frequency-report.txt + echo "Populated weight column in $vst" +fi diff --git a/govarnam/schemes/build_source_with_lfs_zip.sh b/govarnam/schemes/build_source_with_lfs_zip.sh new file mode 100644 index 0000000..6eef96a --- /dev/null +++ b/govarnam/schemes/build_source_with_lfs_zip.sh @@ -0,0 +1,2 @@ +git lfs fetch +zip -r source-with-lfs.zip . -x '.git' diff --git a/govarnam/schemes/build_zips.sh b/govarnam/schemes/build_zips.sh new file mode 100644 index 0000000..fa49765 --- /dev/null +++ b/govarnam/schemes/build_zips.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +cd schemes + +schemes=("as" "bn" "gu" "hi" "kn" "ml" "ml-inscript" "mr" "ne" "or" "pa" "sa" "ta" "te") +for schemeID in ${schemes[@]}; do + cp ../install.sh.in $schemeID/install.sh + sed -i "s#@INSTALL_PREFIX@#/usr/local#g" $schemeID/install.sh + chmod +x $schemeID/install.sh + + cp ../import.sh.in $schemeID/import.sh + chmod +x $schemeID/import.sh + + cp ../README.txt.in $schemeID/README.md + + zip -r ../$schemeID.zip $schemeID -x '*.scheme' -x '*.txt' -x '*.vst.learnings*' +done \ No newline at end of file diff --git a/govarnam/schemes/compile-scheme.rb b/govarnam/schemes/compile-scheme.rb new file mode 100644 index 0000000..f4c56b0 --- /dev/null +++ b/govarnam/schemes/compile-scheme.rb @@ -0,0 +1,741 @@ +#!/usr/bin/env ruby + +# encoding: utf-8 + +require 'optparse' +require './varnam' + +''' +Compile a scheme file to VST +Requires govarnam +TODO remove dependency on govarnam +''' + +$options = {} + +$custom_lists = {} +$current_custom_list = [] + +# Starts a list context. Any tokens created inside will get added to this list +# It can have multiple list names and token will get added to all of these. One token +# can be in multiple lists +def list(*names, &block) + if not $current_custom_list.empty? + # This happens when user tries to nest list. 
+ # Nesting list is not allowed + error "Can't create nested list" + exit (1) + end + + if names.empty? + error "List should have a name" + exit (1) + end + + names.each do |name| + if not name.is_a?(String) and not name.is_a?(Symbol) + error "List name should be a string or symbols" + exit (1) + end + + $custom_lists[name] = [] if not $custom_lists.has_key?(name) + $current_custom_list << $custom_lists[name] + end + + yield if block_given? +ensure + $current_custom_list = [] +end + +def push_to_current_custom_list(token) + if token.nil? + error "Can't add empty token" + exit (1) + end + + $current_custom_list.each do |l| + l.push(token) + end +end + +# We handle method missing to return appropriate lists +def self.method_missing(name, *args, &block) + return $custom_lists[name] if $custom_lists.has_key?(name) + super +end + +# this contains default symbols key overridden in the scheme file +# key will be the token type +$overridden_default_symbols = [] + +def _ensure_sanity_of_array(array) + # Possibilities are + # [e1, e2] + # [e1, [e2,e3], e4] + error "An empty array won't workout" if array.size == 0 + array.each do |element| + if element.is_a?(Array) + _ensure_sanity_of_array(element) + else + _ensure_type_safety(element) + end + end +end + +def _ensure_sanity_of_element(element) + if element.is_a?(Array) + _ensure_sanity_of_array(element) + else + _ensure_type_safety(element) + if element.is_a?(String) and element.length == 0 + error "Empty values are not allowed" + end + end +end + +def _ensure_type_safety(element) + valid_types = [Integer, String, Array] + error "#{element.class} is not a valid type. 
Valid types are #{valid_types.to_s}" if not valid_types.include?(element.class) +end + +def _ensure_sanity(hash) + if not hash.is_a?(Hash) + error "Expected a Hash, but got a #{hash.class}" + exit 1 + end + + hash.each_pair do |key, value| + _context.current_expression = "#{key} => #{value}" + + _ensure_sanity_of_element (key) + _ensure_sanity_of_element (value) + + warn "#{value} has more than three elements. Additional elements specified will be ignored" if value.is_a?(Array) and value.size > 3 + + _context.current_expression = nil + end +end + +def _extract_keys_values_and_persist(keys, values, token_type, match_type = Varnam::VARNAM_MATCH_EXACT, priority, accept_condition) + keys.each do |key| + if key.is_a?(Array) + # This a possibility match + key.flatten! + _extract_keys_values_and_persist(key, values, token_type, Varnam::VARNAM_MATCH_POSSIBILITY, priority, accept_condition) + else + _persist_key_values(key, values, token_type, match_type, priority, accept_condition) + end + end +end + +def _persist_key_values(pattern, values, token_type, match_type, priority, accept_condition) + return if _context.errors > 0 + + match = match_type == Varnam::VARNAM_MATCH_EXACT ? "EXACT" : "POSSIBILITY" + + if (values.is_a?(Array)) + values.flatten! + value1 = values[0] + value2 = values[1] if values.size >= 2 + value3 = values[2] if values.size >= 3 + else + value1 = values + value2 = "" + value3 = "" + end + + tag = _context.current_tag + tag = "" if tag.nil? + created = VarnamLibrary.vm_create_token($varnam_handle, pattern, value1, value2, value3, tag, token_type, match_type, priority, accept_condition, 1) + if created != 0 + error_message = VarnamLibrary.varnam_get_last_error($varnam_handle) + error error_message + return + end + + _context.tokens[token_type] = [] if _context.tokens[token_type].nil? 
+ vtoken = VarnamSymbol.new(token_type, pattern, value1, value2, value3, tag, match_type, priority, accept_condition) + _context.tokens[token_type].push(vtoken) + push_to_current_custom_list vtoken +end + +def flush_unsaved_changes + saved = VarnamLibrary.vm_flush_buffer($varnam_handle) + if saved != 0 + error_message = VarnamLibrary.varnam_get_last_error($varnam_handle) + error error_message + return + end +end + +def infer_dead_consonants(infer) + configured = VarnamLibrary.varnam_config($varnam_handle, Varnam::VARNAM_CONFIG_USE_DEAD_CONSONANTS, infer ? 1 : 0) + if configured != 0 + error_message = VarnamLibrary.varnam_get_last_error($varnam_handle) + error error_message + return + end +end + +def ignore_duplicates(ignore) + configured = VarnamLibrary.varnam_config($varnam_handle, Varnam::VARNAM_CONFIG_IGNORE_DUPLICATE_TOKEN, ignore ? 1 : 0) + if configured != 0 + error_message = VarnamLibrary.varnam_get_last_error($varnam_handle) + error error_message + return + end +end + +def set_scheme_details() + d = VarnamLibrary::SchemeDetails.new + d[:identifier] = FFI::MemoryPointer.from_string($scheme_details[:identifier]) + d[:langCode] = FFI::MemoryPointer.from_string($scheme_details[:langCode]) + d[:displayName] = FFI::MemoryPointer.from_string($scheme_details[:displayName]) + d[:author] = FFI::MemoryPointer.from_string($scheme_details[:author]) + d[:compiledDate] = FFI::MemoryPointer.from_string(Time.now.to_s) + if $scheme_details[:isStable].nil? 
+ d[:isStable] = 0 + else + d[:isStable] = $scheme_details[:isStable] + end + + done = VarnamLibrary.vm_set_scheme_details($varnam_handle, d.pointer) + if done != 0 + error_message = VarnamLibrary.varnam_get_last_error($varnam_handle) + error error_message + return + end +end + +$scheme_details = {} + +def language_code(code) + $scheme_details[:langCode] = code +end + +def identifier(id) + $scheme_details[:identifier] = id +end + +def display_name(name) + $scheme_details[:displayName] = name +end + +def author(name) + $scheme_details[:author] = name +end + +def stable(value) + $scheme_details[:isStable] = 0 + $scheme_details[:isStable] = 1 if value +end + +def generate_cv + all_vowels = get_vowels + all_consonants = get_consonants + + all_consonants.each do |c| + consonant_has_inherent_a_sound = c.pattern.end_with?('a') and not c.pattern[c.pattern.length - 2] == 'a' + all_vowels.each do |v| + next if v.value2.nil? or v.value2.length == 0 + + if consonant_has_inherent_a_sound + pattern = "#{c.pattern[0..c.pattern.length-2]}#{v.pattern}" + else + pattern = "#{c.pattern}#{v.pattern}" + end + + values = ["#{c.value1}#{v.value2}"] + if c.match_type == Varnam::VARNAM_MATCH_POSSIBILITY or v.match_type == Varnam::VARNAM_MATCH_POSSIBILITY + match_type = Varnam::VARNAM_MATCH_POSSIBILITY + else + match_type = Varnam::VARNAM_MATCH_EXACT + end + + accept_condition = nil + if not v.accept_condition == Varnam::VARNAM_TOKEN_ACCEPT_ALL and not c.accept_condition == Varnam::VARNAM_TOKEN_ACCEPT_ALL + accept_condition = v.accept_condition + elsif not v.accept_condition == Varnam::VARNAM_TOKEN_ACCEPT_ALL + accept_condition = v.accept_condition + else + accept_condition = c.accept_condition + end + + priority = Varnam::VARNAM_TOKEN_PRIORITY_NORMAL + if v.priority < c.priority + priority = v.priority + else + priority = c.priority + end + + + _persist_key_values pattern, values, Varnam::VARNAM_SYMBOL_CONSONANT_VOWEL, match_type, priority, accept_condition + end + end +end + +def 
delete_token(pattern: nil, value1: nil) + search_symbol_ptr = FFI::MemoryPointer.new :pointer + VarnamLibrary.varnam_new_search_symbol(search_symbol_ptr) + search_criteria = VarnamLibrary::Symbol.new(search_symbol_ptr.get_pointer(0)) + + # pattern && search_criteria.Pattern = FFI::MemoryPointer.from_string(pattern) + value1 && search_criteria.value1 = FFI::MemoryPointer.from_string(value1) + + done = VarnamLibrary.vm_delete_token($varnam_handle, search_criteria); + if done != 0 + error_message = VarnamLibrary.varnam_get_last_error($varnam_handle) + error error_message + end +end + +def combine_array(array, is_pattern, replacements, current_item) + if replacements.empty? + error 'Replacements should be present when combining an array. This could be a bug within varnamc' + exit (1) + end + + result = [] + array.each do |a| + if a.is_a?(Array) + result.push(combine_array(a, is_pattern, replacements, current_item)) + else + if is_pattern + if current_item.match_type == Varnam::VARNAM_MATCH_POSSIBILITY + result.push([a.to_s.gsub("*", replacements[0])]) + else + result.push(a.to_s.gsub("*", replacements[0])) + end + else + new_key = a.to_s.gsub("\*1", replacements[0]) + if replacements.length > 1 and not replacements[1].to_s.empty? + new_key = new_key.gsub("\*2", replacements[1]) + end + if replacements.length > 2 and not replacements[2].to_s.empty? 
+ new_key = new_key.gsub("\*3", replacements[2]) + end + result.push (new_key) + end + end + end + + return result +end + +# Combines an array and a hash values +# This method also replaces the placeholder in hash +def combine(array, hash) + _ensure_sanity(hash) + if not array.is_a?(Array) + error "Expected an array, but got a #{array.class}" + exit 1 + end + + grouped = {} + array.each do |item| + hash.each_pair do |key, value| + new_key = nil + if key.is_a?(Array) + new_key = combine_array(key, true, [item.pattern], item) + else + if item.match_type == Varnam::VARNAM_MATCH_POSSIBILITY + new_key = [[key.to_s.gsub("*", item.pattern)]] + else + new_key = key.to_s.gsub("*", item.pattern) + end + end + + new_value = nil + if value.is_a?(Array) + new_value = combine_array(value, false, [item.value1, item.value2, item.value3], item) + else + new_value = value.to_s.gsub("\*1", item.value1) + if not item.value2.nil? and not item.value2.to_s.empty? + new_value = new_value.gsub("\*2", item.value2) + end + if not item.value3.nil? and not item.value3.to_s.empty? + new_value = new_value.gsub("\*3", item.value3) + end + end + + if grouped[new_value].nil? 
+ grouped[new_value] = new_key + else + grouped[new_value].push(new_key) + end + end + end + + # invert the hash + result = {} + grouped.each_pair do |key, value| + result[value] = key + end + + return result +end + +def _create_token(hash, token_type, options = {}) + return if _context.errors > 0 + + priority = _get_priority options + accept_condition = _get_accept_condition options + + hash.each_pair do |key, value| + if key.is_a?(Array) + _extract_keys_values_and_persist(key, value, token_type, priority, accept_condition) + else + _persist_key_values(key, value, token_type, Varnam::VARNAM_MATCH_EXACT, priority, accept_condition) + end + end +end + +def _validate_number(number, name) + if not number.is_a?(Integer) + error "#{name} should be a number" + exit (1) + end +end + +def _get_priority(options) + return Varnam::VARNAM_TOKEN_PRIORITY_NORMAL if options[:priority].nil? or options[:priority] == :normal + return Varnam::VARNAM_TOKEN_PRIORITY_LOW if options[:priority] == :low + return Varnam::VARNAM_TOKEN_PRIORITY_HIGH if options[:priority] == :high + + _validate_number options[:priority], "priority" + + return options[:priority] +end + +def _get_accept_condition(options) + return Varnam::VARNAM_TOKEN_ACCEPT_ALL if options[:accept_if].nil? or options[:accept_if] == :all + return Varnam::VARNAM_TOKEN_ACCEPT_IF_STARTS_WITH if options[:accept_if] == :starts_with + return Varnam::VARNAM_TOKEN_ACCEPT_IF_IN_BETWEEN if options[:accept_if] == :in_between + return Varnam::VARNAM_TOKEN_ACCEPT_IF_ENDS_WITH if options[:accept_if] == :ends_with + + _validate_number options[:accept_if], "accept_if" +end + +def vowels(options={}, hash) + _ensure_sanity(hash) + _create_token(hash, Varnam::VARNAM_SYMBOL_VOWEL, options) +end + +def consonants(options={}, hash) + _ensure_sanity(hash) + _create_token(hash, Varnam::VARNAM_SYMBOL_CONSONANT, options) +end + +def period(p) + _create_token({"." 
=> p}, Varnam::VARNAM_SYMBOL_PERIOD, {}) +end + +def tag(name, &block) + _context.current_tag = name + block.call + _context.current_tag = nil +end + +def consonant_vowel_combinations(options={}, hash) + _ensure_sanity(hash) + _create_token(hash, Varnam::VARNAM_SYMBOL_CONSONANT_VOWEL, options) +end + +def anusvara(options={}, hash) + _ensure_sanity(hash) + _create_token(hash, Varnam::VARNAM_SYMBOL_ANUSVARA, options) +end + +def visarga(options={}, hash) + _ensure_sanity(hash) + _create_token(hash, Varnam::VARNAM_SYMBOL_VISARGA, options) +end + +def virama(options={}, hash) + _ensure_sanity(hash) + _create_token(hash, Varnam::VARNAM_SYMBOL_VIRAMA, options) +end + +def symbols(options={}, hash) + _ensure_sanity(hash) + _create_token(hash, Varnam::VARNAM_SYMBOL_SYMBOL, options) +end + +def numbers(options={}, hash) + _ensure_sanity(hash) + _create_token(hash, Varnam::VARNAM_SYMBOL_NUMBER, options) +end + +def others(options={}, hash) + _ensure_sanity(hash) + _create_token(hash, Varnam::VARNAM_SYMBOL_OTHER, options) +end + +def non_joiner(hash) + _ensure_sanity(hash) + _create_token(hash, Varnam::VARNAM_SYMBOL_NON_JOINER); + $overridden_default_symbols.push Varnam::VARNAM_SYMBOL_NON_JOINER +end + +def joiner(hash) + _ensure_sanity(hash) + _create_token(hash, Varnam::VARNAM_SYMBOL_JOINER); + $overridden_default_symbols.push Varnam::VARNAM_SYMBOL_JOINER +end + +def get_tokens(token_type, criteria = {}) + tokens = _context.tokens[token_type] + if criteria.empty? 
+ return tokens + elsif criteria[:exact] + return tokens.find_all {|t| t.match_type == Varnam::VARNAM_MATCH_EXACT} + else + return tokens.find_all {|t| t.match_type == Varnam::VARNAM_MATCH_POSSIBILITY} + end +end + +def get_vowels(criteria = {}) + return get_tokens(Varnam::VARNAM_SYMBOL_VOWEL, criteria) +end + +def get_consonants(criteria = {}) + return get_tokens(Varnam::VARNAM_SYMBOL_CONSONANT, criteria) +end + +def get_consonant_vowel_combinations(criteria = {}) + return get_tokens(Varnam::VARNAM_SYMBOL_CONSONANT_VOWEL, criteria) +end + +def get_anusvara(criteria = {}) + return get_tokens(Varnam::VARNAM_SYMBOL_ANUSVARA, criteria) +end + +def get_visarga(criteria = {}) + return get_tokens(Varnam::VARNAM_SYMBOL_VISARGA, criteria) +end + +def get_symbols(criteria = {}) + return get_tokens(Varnam::VARNAM_SYMBOL_SYMBOL, criteria) +end + +def get_numbers(criteria = {}) + return get_tokens(Varnam::VARNAM_SYMBOL_OTHER, criteria) +end + +def get_chill() + tokens = get_tokens(Varnam::VARNAM_SYMBOL_CONSONANT, {:exact => true}) + return tokens.find_all {|t| t.tag == "chill"} +end + +def get_virama + tokens = get_tokens(Varnam::VARNAM_SYMBOL_VIRAMA, {}) + if tokens.empty? + error 'Virama is not set' + exit (1) + end + return tokens[0] +end + +def ffito_string(value) + str = "" + ptr = value.to_ptr + if not ptr.null? + str = ptr.read_string + str.force_encoding('UTF-8') + end + return str +end + +def get_dead_consonants(criteria = {}) + # dead consonants are infered by varnam. ruby wrapper don't know anything about it. 
+ symbol_type = Varnam::VARNAM_SYMBOL_DEAD_CONSONANT + + search_symbol_ptr = FFI::MemoryPointer.new :pointer + VarnamLibrary.varnam_new_search_symbol(search_symbol_ptr) + search_criteria = VarnamLibrary::Symbol.new(search_symbol_ptr.get_pointer(0)) + search_criteria[:Type] = symbol_type + + result_ptr = FFI::MemoryPointer.new :pointer + done = VarnamLibrary.varnam_search_symbol_table($varnam_handle, 0, search_criteria, result_ptr); + if done != 0 + error_message = VarnamLibrary.varnam_get_last_error($varnam_handle) + error error_message + return + end + + size = VarnamLibrary.varray_length(result_ptr.get_pointer(0)) + i = 0 + _context.tokens[symbol_type] = [] if _context.tokens[symbol_type].nil? + until i >= size + tok = VarnamLibrary.varray_get(result_ptr.get_pointer(0), i) + ptr = result_ptr.read_pointer + item = VarnamLibrary::Symbol.new(tok) + varnam_token = VarnamSymbol.new( + item[:Type], + item[:Pattern].force_encoding('UTF-8'), + item[:Value1].force_encoding('UTF-8'), + item[:Value2].force_encoding('UTF-8'), + item[:Value3].force_encoding('UTF-8'), + item[:Tag], + item[:MatchType], + item[:Priority], + item[:AcceptCondition], + item[:Flags], + item[:Weight] + ) + _context.tokens[symbol_type].push(varnam_token) + i += 1 + end + return get_tokens(symbol_type, criteria) +end + +# TODO warnings haven't been implemented even with libvarnam +def print_warnings_and_errors + if _context.warnings > 0 + _context.warning_messages.each do |msg| + puts msg + end + end + + if _context.errors > 0 + _context.error_messages.each do |msg| + puts msg + end + end +end + +# Sets default symbols if user has not set overridden in the scheme file +def set_default_symbols + non_joiner "_" => "_" if not $overridden_default_symbols.include?(Varnam::VARNAM_SYMBOL_NON_JOINER) + joiner "__" => "__" if not $overridden_default_symbols.include?(Varnam::VARNAM_SYMBOL_JOINER) + symbols "-" => "-" +end + +# TODO +# GoVarnam doesn't support stemming +def _persist_stemrules(old_ending, 
new_ending) + return if _context.errors > 0 + rc = VarnamLibrary.varnam_create_stemrule($varnam_handle, old_ending, new_ending) + if rc != 0 + error_message = VarnamLibrary.varnam_get_last_error($varnam_handle) + error error_message + end + return rc +end + +def _create_stemrule(hash, options) + return if _context.errors > 0 + hash.each_pair do |key,value| + rc = _persist_stemrules(key, value) + if rc != 0 + puts "could not create stemrule for " + key + ":" + value + end + end +end + +def stemrules(hash,options={}) + # _ensure_sanity(hash) + # _create_stemrule(hash, options) + puts VarnamLibrary.varnam_get_last_error($varnam_handle) +end + +def exceptions_stem(hash, options={}) + # hash.each_pair do |key,value| + # rc = VarnamLibrary.varnam_create_stem_exception($varnam_handle, key, value) + # if rc != 0 + # puts "Could not create stemrule exception" + # end + # end +end + +def compile_scheme(scheme_path, output_path) + file_name = File.basename(scheme_path) + if file_name.include?(".") + file_name = file_name.split(".")[0] + end + + $vst_name = file_name + ".vst" + $vst_path = output_path || File.join(Dir.pwd, $vst_name) + + if File.exists?($vst_path) + File.delete($vst_path) + end + + $varnam_handle = initialize_vst_maker_handle($vst_path) + + if $options[:verbose] + puts "Turning debug on" + VarnamLibrary.varnam_debug($varnam_handle, 1) + end + + puts "Compiling #{scheme_path}" + puts "Building #{$vst_path}" + + at_exit { + print_warnings_and_errors if _context.errors > 0 + puts "Completed with '#{_context.warnings}' warning(s) and '#{_context.errors}' error(s)" + } + + load scheme_path + set_default_symbols + flush_unsaved_changes + set_scheme_details + + if _context.errors > 0 + returncode = 1 + else + returncode = 0 + end + + exit(returncode) +end + +optparse = OptionParser.new do |opts| + opts.banner = "Usage: compile-schema options" + + $options[:verbose] = false + opts.on('-v', '--verbose', 'Enable verbose output') do + $options[:verbose] = true + end + + 
# ability to provide varnam library name + opts.on('-l', '--library FILE', 'Sets the varnam library') do |file| + if not File.exist?(file) + puts "Can't find #{file}" + exit 1 + end + $library = file + end + + if $options[:library].nil? + govarnam_lib = find_govarnam + if govarnam_lib.nil? + puts "Can't find govarnam shared library. Try specifying the full path using -l option" + exit 1 + else + puts "Using #{$govarnam_lib}" if $options[:verbose] + end + end + + $options[:debug] = false + opts.on('-z', '--debug', 'Enable debugging') do + $options[:debug] = true + end + + $options[:output] = nil + opts.on('-o', '-o output_path', 'Path to output VST') do |path| + $options[:output] = path + end + + opts.on('-s', '-s path_to_scheme_file_path', 'Path to scheme file') do |path| + $options[:scheme] = path + end +end + +optparse.parse! + +if File.exists? ($options[:scheme]) + compile_scheme($options[:scheme], $options[:output]) +else + puts "File doesn't exist" +end \ No newline at end of file diff --git a/govarnam/schemes/gu/README.md b/govarnam/schemes/gu/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/gu/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. 
To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/gu/import.sh b/govarnam/schemes/gu/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/gu/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit +fi + +schemeID=$(ls $SCRIPT_DIR/*.vst) +schemeID=${schemeID/$SCRIPT_DIR\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in $SCRIPT_DIR/*/*.vlf; do + varnamcli -s $schemeID -import $vlf +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/hi/README.md b/govarnam/schemes/hi/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/hi/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. 
To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/hi/import.sh b/govarnam/schemes/hi/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/hi/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit +fi + +schemeID=$(ls $SCRIPT_DIR/*.vst) +schemeID=${schemeID/$SCRIPT_DIR\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in $SCRIPT_DIR/*/*.vlf; do + varnamcli -s $schemeID -import $vlf +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/import.sh.in b/govarnam/schemes/import.sh.in new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/import.sh.in @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit +fi + +schemeID=$(ls $SCRIPT_DIR/*.vst) +schemeID=${schemeID/$SCRIPT_DIR\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in $SCRIPT_DIR/*/*.vlf; do + varnamcli -s $schemeID -import $vlf +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/install.sh.in b/govarnam/schemes/install.sh.in new file mode 100644 index 0000000..cffe11c --- /dev/null +++ b/govarnam/schemes/install.sh.in @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +schemeID=$(ls $SCRIPT_DIR/*.vst) +schemeID=${schemeID/$SCRIPT_DIR\//} +schemeID=${schemeID/.vst/} + 
+ARG1=${1:-install} + +if [ "$ARG1" == "install" ]; then + # Install Scheme + + sudo mkdir -p @INSTALL_PREFIX@/share/varnam/schemes/ + sudo cp *.vst @INSTALL_PREFIX@/share/varnam/schemes/ + + msg="Installed basic $schemeID language support. Use import.sh for importing words" + echo "$msg" + notify-send "$msg" &> /dev/null || true +elif [ "$1" = "uninstall" ]; then + sudo rm "@INSTALL_PREFIX@/share/varnam/schemes/$schemeID.vst" + sudo rmdir "@INSTALL_PREFIX@/share/varnam/schemes/" + + echo "Uninstallation finished" +fi diff --git a/govarnam/schemes/install_all_schemes.sh b/govarnam/schemes/install_all_schemes.sh new file mode 100644 index 0000000..988719b --- /dev/null +++ b/govarnam/schemes/install_all_schemes.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +SUDO=${SUDO:-sudo} +PREFIX=/usr/local + +"${SUDO}" mkdir -p "${PREFIX}/share/varnam/schemes/" +for scheme in schemes/*/*.vst; do + echo $scheme; + "${SUDO}" cp $scheme "${PREFIX}/share/varnam/schemes/" +done diff --git a/govarnam/schemes/kn/README.md b/govarnam/schemes/kn/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/kn/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. 
To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/kn/import.sh b/govarnam/schemes/kn/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/kn/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit +fi + +schemeID=$(ls $SCRIPT_DIR/*.vst) +schemeID=${schemeID/$SCRIPT_DIR\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in $SCRIPT_DIR/*/*.vlf; do + varnamcli -s $schemeID -import $vlf +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/ml-inscript/README.md b/govarnam/schemes/ml-inscript/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/ml-inscript/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. 
To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/ml-inscript/import.sh b/govarnam/schemes/ml-inscript/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/ml-inscript/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit 1 # refusal is a failure: a bare exit would return 0 here +fi + +schemeID=$(ls "$SCRIPT_DIR"/*.vst) +schemeID=${schemeID/"$SCRIPT_DIR"\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in "$SCRIPT_DIR"/*/*.vlf; do + varnamcli -s "$schemeID" -import "$vlf" +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/ml/README.md b/govarnam/schemes/ml/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/ml/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. 
To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/ml/import.sh b/govarnam/schemes/ml/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/ml/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit 1 # refusal is a failure: a bare exit would return 0 here +fi + +schemeID=$(ls "$SCRIPT_DIR"/*.vst) +schemeID=${schemeID/"$SCRIPT_DIR"\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in "$SCRIPT_DIR"/*/*.vlf; do + varnamcli -s "$schemeID" -import "$vlf" +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/ml/ml-basic/pack.json b/govarnam/schemes/ml/ml-basic/pack.json new file mode 100644 index 0000000..60f4c27 --- /dev/null +++ b/govarnam/schemes/ml/ml-basic/pack.json @@ -0,0 +1,58 @@ +{ + "identifier": "ml-basic", + "name": "Malayalam Basic", + "description": "Words sourced from Malayalam Wikipedia. 
Each file has maximum 30,000 words.", + "lang": "ml", + "pages_count": 8, + "total_words": 214618, + "pages": [ + { + "identifier": "ml-basic-1", + "page": 1, + "description": "Words with confidence lesser than 258", + "size": 1785319 + }, + { + "identifier": "ml-basic-2", + "page": 2, + "description": "Words with confidence lesser than 221", + "size": 1880995 + }, + { + "identifier": "ml-basic-3", + "page": 3, + "description": "Words with confidence lesser than 188", + "size": 1914151 + }, + { + "identifier": "ml-basic-4", + "page": 4, + "description": "Words with confidence lesser than 154", + "size": 1944715 + }, + { + "identifier": "ml-basic-5", + "page": 5, + "description": "Words with confidence lesser than 121", + "size": 1963957 + }, + { + "identifier": "ml-basic-6", + "page": 6, + "description": "Words with confidence lesser than 87", + "size": 1969771 + }, + { + "identifier": "ml-basic-7", + "page": 7, + "description": "Words with confidence lesser than 54", + "size": 1968964 + }, + { + "identifier": "ml-basic-8", + "size": 310457, + "description": "Words with confidence lesser than 20", + "page": 8 + } + ] +} \ No newline at end of file diff --git a/govarnam/schemes/ml/ml-english/pack.json b/govarnam/schemes/ml/ml-english/pack.json new file mode 100644 index 0000000..7a267b4 --- /dev/null +++ b/govarnam/schemes/ml/ml-english/pack.json @@ -0,0 +1,16 @@ +{ + "identifier": "ml-english", + "name": "English words in Malayalam", + "description": "Words like india => ഇന്ത്യ, scene => സീൻ etc.", + "lang": "ml", + "pages_count": 1, + "total_words": 1465, + "pages": [ + { + "identifier": "ml-english-1", + "page": 1, + "description": "Words with confidence lesser than 34", + "size": 144837 + } + ] +} \ No newline at end of file diff --git a/govarnam/schemes/mr/README.md b/govarnam/schemes/mr/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/mr/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. 
Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/mr/import.sh b/govarnam/schemes/mr/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/mr/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit 1 # refusal is a failure: a bare exit would return 0 here +fi + +schemeID=$(ls "$SCRIPT_DIR"/*.vst) +schemeID=${schemeID/"$SCRIPT_DIR"\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in "$SCRIPT_DIR"/*/*.vlf; do + varnamcli -s "$schemeID" -import "$vlf" +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/ne/README.md b/govarnam/schemes/ne/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/ne/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. 
To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/ne/import.sh b/govarnam/schemes/ne/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/ne/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit 1 # refusal is a failure: a bare exit would return 0 here +fi + +schemeID=$(ls "$SCRIPT_DIR"/*.vst) +schemeID=${schemeID/"$SCRIPT_DIR"\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in "$SCRIPT_DIR"/*/*.vlf; do + varnamcli -s "$schemeID" -import "$vlf" +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/or/README.md b/govarnam/schemes/or/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/or/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. 
To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/or/import.sh b/govarnam/schemes/or/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/or/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit 1 # refusal is a failure: a bare exit would return 0 here +fi + +schemeID=$(ls "$SCRIPT_DIR"/*.vst) +schemeID=${schemeID/"$SCRIPT_DIR"\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in "$SCRIPT_DIR"/*/*.vlf; do + varnamcli -s "$schemeID" -import "$vlf" +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/pa/README.md b/govarnam/schemes/pa/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/pa/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. 
To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/pa/import.sh b/govarnam/schemes/pa/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/pa/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit 1 # refusal is a failure: a bare exit would return 0 here +fi + +schemeID=$(ls "$SCRIPT_DIR"/*.vst) +schemeID=${schemeID/"$SCRIPT_DIR"\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in "$SCRIPT_DIR"/*/*.vlf; do + varnamcli -s "$schemeID" -import "$vlf" +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/sa/README.md b/govarnam/schemes/sa/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/sa/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. 
To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/sa/import.sh b/govarnam/schemes/sa/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/sa/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit 1 # refusal is a failure: a bare exit would return 0 here +fi + +schemeID=$(ls "$SCRIPT_DIR"/*.vst) +schemeID=${schemeID/"$SCRIPT_DIR"\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in "$SCRIPT_DIR"/*/*.vlf; do + varnamcli -s "$schemeID" -import "$vlf" +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/schemes/README.md b/govarnam/schemes/schemes/README.md new file mode 100644 index 0000000..46b1427 --- /dev/null +++ b/govarnam/schemes/schemes/README.md @@ -0,0 +1,13 @@ +# Making A Language Pack + +``` +cd ml +mkdir ml-basic + +# Copy frequency report as report.txt in here + +# Make pack.json + +# Run +python3 scripts/make-pack.py ml ./ +``` diff --git a/govarnam/schemes/schemes/as/as.scheme b/govarnam/schemes/schemes/as/as.scheme new file mode 100644 index 0000000..07f6510 --- /dev/null +++ b/govarnam/schemes/schemes/as/as.scheme @@ -0,0 +1,216 @@ +# encoding: utf-8 + +## +# Copyright (C) Madhumita Ghar +# +# This is part of libvarnam. 
See LICENSE.txt for the license +## + +language_code "as" +identifier "as" +display_name "Assamese" +author "Madhumita Ghar" + +ignore_duplicates false + +$virama = "\u{09CD}" +$nukta = "\u{09BC}" + +$an = "\u{0981}" +$am = "\u{0982}" + +virama "~" => "\u{09CD}" +visarga "H" => "\u{0983}" + +# default sound in Bengali is "o" or "au" +# a is pronounced as au +vowels ["a", ["o", "au"]] => "অ", + ["aa", "A", ["a"]] => ["আ", "া"], + "i" => ["ই", "ি"], + ["ee", "I", "ii", ["i"]] => ["ঈ", "ী"], + "u" => ["উ", "ু"], + ["uu", "oo", "U", ["u"]] => ["ঊ", "ূ"], + + ["R", ["ri", "ru"]] => ["ঋ", "ৃ"], + ["Rr", ["Ri", "Ru", "R"]] => ["ৠ", "ৄ"], + ["L", ["li", "lu"]] => ["ঌ", "ৢ"], + ["Ll", ["Li", "Lu", "L"]] => ["ৡ", "ৣ"], + + "e" => ["এ", "ে"], + +# ai is pronounced is oi + ["oi", "ai", "ei"] => ["ঐ", "ৈ"], + ["o", ["a"]] => ["ও", "\u{09CB}"], + ["ou", "ow", ["au"]] => ["ঔ", "\u{09CC}"], + + ["AN"] => ["আ#{$an}", "া#{$an}"], + ["uN"] => ["উ#{$an}", "ু#{$an}"], + ["UN"] => ["ঊ#{$an}", "ূ#{$an}"], + ["N", "M"] => ["অং","\u{0982}"], + ["aH"] => ["অঃ", "\u{0983}"] + +list :can_make_cluster do +consonants ["k", ["c"]] => "ক", + ["kh", ["k"]] => "খ", + "g" => "গ", + ["gh", ["g"]] => "ঘ", + "ng" => "ঙ", + + "ch" => "চ", + ["Ch", ["ch"]] => "ছ", + "j" => ["জ", ["য"]], + ["jh", ["j"]] => "ঝ", + "Ny" => "ঞ", + + "T" => "ট", + ["Th", ["T"]] => "ঠ", + "D" => "ড", + ["DD", ["r", "d"]] => ["ড়", "ড#{$nukta}"], + ["Dh", ["D"]] => "ঢ", + "DH" => ["ঢ়", "ঢ#{$nukta}"], + ["NN", ["n"]] => "ণ", + + "t" => ["ত", ["ৎ"]], + ["th", ["t"]] => "থ", + "d" => "দ", + ["dh", ["d"]] => "ধ", + "n" => "ন", + + "p" => "প", + ["ph", "f"] => "ফ", + +# following is different from Bengali + ["b", "v", "w"] => "ৱ", +# end of diff + ["bh", ["b"]] => "ভ", + "m" => "ম", + +# ya is pronounced as ja +# ya+nukta is pronounced as ya + ["y", ["j", "ae"]] => "য", + ["Y", ["y"]] => ["য়", "য#{$nukta}"], +# following is different from Bengali + "r" => "ৰ", +# end of diff + "l" => "ল", + "s" => "স", +# sha also expands to swa + 
["sh", ["s"]] => ["শ", ["স্ৱ"]], + ["Sh", ["sh"]] => "ষ", + "h" => "হ" +end + +# conjuncts for more than 2 consonants (except first one) +# low-priority ones are added due to rule mentioned in line 150 onwards +consonants ["kkh", ["kSh"]] => "ক্ষ", # mapping kkh also to क्ष + ["kry", "krj", ["krr"]] => "ক্ৰ্য", + ["kShNN", "kkhNN"] => "ক্ষ্ণ", + ["kShn", "kkhn"] => "ক্ষ্ন", + ["kShm", "kkhm"] => "ক্ষ্ম", + ["kShy", "kShj", "kkhy", "kkhj"] => "ক্ষ্য", + "ngkl" => "ঙ্ক্ল", + ["ngkSh", "ngkkh"] => "ঙ্ক্ষ", + ["nggy", "nggj", ["nggg"]] => "ঙ্গ্য", + ["chChv", "chChw", "chChb", ["chChCh"]] => "চ্ছ্ৱ", + ["jjv", "jjw", "jjb", ["jjj"]] => "জ্জ্ৱ", + "NNtr" => "ণ্ত্ৰ", + ["NNthy", "NNthj", ["NNthth"]] => "ণ্থ্য", + ["ttv", "ttw", "ttb", ["ttt"]] => "ত্ত্ৱ", + "ttr" => "ত্ত্ৰ", + ["try", "trj", ["trr"]] => "ত্ৰ্য", + ["thry", "thrj", ["thrr"]] => "থ্ৰ্য", + "dbhr" => "দ্ভ্ৰ", + ["dgy", "dgj", ["dgg"]] => "দ্গ্য", + ["ddy", "ddj", ["ddd"]] => "দ্দ্য", + "ddr" => "দ্দ্ৰ", + "ddhr" => "দ্ধ্ৰ", + ["dvy", "dwy", "dby", "dvj", "dwj", "dbj", ["dbb"]] => "দ্ৱ্য", + ["dry", "drj", ["drr"]] => "দ্ৰ্য", + ["dhry", "dhrj", ["dhrr"]] => "ধ্ৰ্য", + ["ntv", "ntw", "ntb", ["ntt"]] => "ন্ত্ৱ", + "ntr" => "ন্ত্ৰ", + ["ntry", "ntrj", ["ntrr"]] => "ন্ত্ৰ্য", + ["ndv", "ndw", "ndb", ["ndd"]] => "ন্দ্ৱ", + ["ndy", "ndj", ["ndd"]] => "ন্দ্য", + "ndr" => "ন্দ্ৰ", + ["ndry", "ndrj", ["ndrr"]] => "ন্দ্ৰ্য", + ["ndhy", "ndhj", ["ndhdh"]] => "ন্ধ্য", + "ndhr" => "ন্ধ্ৰ", + ["pry", "prj", ["prr"]] => "প্ৰ্য", + "mpr" => "ম্প্ৰ", + "mbhr" => "ম্ভ্ৰ", + "rkk" => "ৰ্ক্ক", + "rkT" => "ৰ্ক্ট", + ["rdhv", "rdhw", "rdhb", ["rdhdh"]] => "ৰ্ধ্ৱ", + "Shkr" => "ষ্ক্ৰ", + "Shkl" => "ষ্ক্ল", + "Shtr" => "ষ্ত্ৰ", + ["ShThy", "ShThj", ["ShThTh"]] => "ষ্ঠ্য", + "sTr" => "স্ট্ৰ", + "str" => "স্ত্ৰ", + ["sthy", "sthj", ["sthth"]] => "স্থ্য", + "spr" => "স্প্ৰ", + "spl" => "স্প্ল", +# smriti can be shriti + ["smr", ["shr"]] => "স্ম্ৰ" + +ignore_duplicates true +can_make_cluster.each do |c1| + can_make_cluster.each do 
|c2| + +# Bengali, apparently, has a rule that if a half consonant comes +# before ya, va, or ma, then that consonant gets repeated and +# ya, va, ma will be eliminated e.g. anya > anna / onno +# mahatva > mohotto + + if c1.pattern == c2.pattern + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}য"] + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}ৱ"] + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}ম"] + else + if c1.match_type == 1 + consonants ["#{c1.pattern}#{c2.pattern}"] => ["#{c1.value1}#{$virama}#{c2.value1}"] + else + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}#{c2.value1}"] + end + end + end +end + +# Half forms for ম +consonants(combine can_make_cluster, ["m*"] => ["ম#{$virama}*1"]) + +generate_cv + +# default sound associated with every consonant is "o" +consonants(combine get_consonants, ["*a"] => ["*1"]) +consonants(combine get_consonants, [["*o"]] => ["*1"]) + +ignore_duplicates true +#ignore_duplicates false +#consonants [["m"]] => "ম" + +# Need to replace this when we have a way to pass value2 and value3 in combine +get_vowels.each do |vowel| + if vowel.match_type == 1 + consonant_vowel_combinations ["m#{vowel.pattern}"] => ["ম#{vowel.value2}"] + else + consonant_vowel_combinations [["m#{vowel.pattern}"]] => ["ম#{vowel.value2}"] + end +end +symbols ["m", ["n"]] => $am, + [["n", "m"]] => $an, + ["|"] => "।" + +symbols({:priority => :low}, ["aan", "aam"] => $an) + +numbers "0" => "০", + "1" => "১", + "2" => "২", + "3" => "৩", + "4" => "৪", + "5" => "৫", + "6" => "৬", + "7" => "৭", + "8" => "৮", + "9" => "৯" diff --git a/govarnam/schemes/schemes/bn/bn.scheme b/govarnam/schemes/schemes/bn/bn.scheme new file mode 100644 index 0000000..5627794 --- /dev/null +++ b/govarnam/schemes/schemes/bn/bn.scheme @@ -0,0 +1,211 @@ +# encoding: utf-8 + +## +# Copyright (C) Madhumita Ghar +# +# This is part of libvarnam. 
See LICENSE.txt for the license +## + +language_code "bn" +identifier "bn" +display_name "Bengali" +author "Madhumita Ghar" + +ignore_duplicates false + +$virama = "\u{09CD}" +$nukta = "\u{09BC}" + +$an = "\u{0981}" +$am = "\u{0982}" + +virama "~" => "\u{09CD}" +visarga "H" => "\u{0983}" + +# default sound in Bengali is "o" or "au" +# a is pronounced as au +vowels ["a", ["o", "au"]] => "অ", + ["aa", "A", ["a"]] => ["আ", "া"], + "i" => ["ই", "ি"], + ["ee", "I", "ii", ["i"]] => ["ঈ", "ী"], + "u" => ["উ", "ু"], + ["uu", "oo", "U", ["u"]] => ["ঊ", "ূ"], + + ["R", ["ri", "ru"]] => ["ঋ", "ৃ"], + ["Rr", ["Ri", "Ru", "R"]] => ["ৠ", "ৄ"], + ["L", ["li", "lu"]] => ["ঌ", "ৢ"], + ["Ll", ["Li", "Lu", "L"]] => ["ৡ", "ৣ"], + + "e" => ["এ", "ে"], + +# ai is pronounced is oi + ["oi", "ai", "ei"] => ["ঐ", "ৈ"], + ["o", ["a"]] => ["ও", "\u{09CB}"], + ["ou", "ow", ["au"]] => ["ঔ", "\u{09CC}"], + + ["AN"] => ["আ#{$an}", "া#{$an}"], + ["uN"] => ["উ#{$an}", "ু#{$an}"], + ["UN"] => ["ঊ#{$an}", "ূ#{$an}"], + ["N", "M"] => ["অং","\u{0982}"], + ["aH"] => ["অঃ", "\u{0983}"] + +list :can_make_cluster do +consonants ["k", ["c"]] => "ক", + ["kh", ["k"]] => "খ", + "g" => "গ", + ["gh", ["g"]] => "ঘ", + "ng" => "ঙ", + + "ch" => "চ", + ["Ch", ["ch"]] => "ছ", + "j" => ["জ", ["য"]], + ["jh", ["j"]] => "ঝ", + "Ny" => "ঞ", + + "T" => "ট", + ["Th", ["T"]] => "ঠ", + "D" => "ড", + ["DD", ["r", "d"]] => ["ড়", "ড#{$nukta}"], + ["Dh", ["D"]] => "ঢ", + "DH" => ["ঢ়", "ঢ#{$nukta}"], + ["NN", ["n"]] => "ণ", + + "t" => ["ত", ["ৎ"]], + ["th", ["t"]] => "থ", + "d" => "দ", + ["dh", ["d"]] => "ধ", + "n" => "ন", + + "p" => "প", + ["ph", "f"] => "ফ", + ["b", "v", "w"] => "ব", + ["bh", ["b"]] => "ভ", + "m" => "ম", + +# ya is pronounced as ja +# ya+nukta is pronounced as ya + ["y", ["j", "ae"]] => "য", + ["Y", ["y"]] => ["য়", "য#{$nukta}"], + "r" => "র", + "l" => "ল", + "s" => "স", +# sha also expands to swa + ["sh", ["s"]] => ["শ", ["স্ব"]], + ["Sh", ["sh"]] => "ষ", + "h" => "হ" +end + +# conjuncts for more than 2 
consonants (except first one) +# low-priority ones are added due to rule mentioned in line 150 onwards +consonants ["kkh", ["kSh"]] => "ক্ষ", # mapping kkh also to क्ष + ["kry", "krj", ["krr"]] => "ক্র্য", + ["kShNN", "kkhNN"] => "ক্ষ্ণ", + ["kShn", "kkhn"] => "ক্ষ্ন", + ["kShm", "kkhm"] => "ক্ষ্ম", + ["kShy", "kShj", "kkhy", "kkhj"] => "ক্ষ্য", + "ngkl" => "ঙ্ক্ল", + ["ngkSh", "ngkkh"] => "ঙ্ক্ষ", + ["nggy", "nggj", ["nggg"]] => "ঙ্গ্য", + ["chChv", "chChw", "chChb", ["chChCh"]] => "চ্ছ্ব", + ["jjv", "jjw", "jjb", ["jjj"]] => "জ্জ্ব", + "NNtr" => "ণ্ত্র", + ["NNthy", "NNthj", ["NNthth"]] => "ণ্থ্য", + ["ttv", "ttw", "ttb", ["ttt"]] => "ত্ত্ব", + "ttr" => "ত্ত্র", + ["try", "trj", ["trr"]] => "ত্র্য", + ["thry", "thrj", ["thrr"]] => "থ্র্য", + "dbhr" => "দ্ভ্র", + ["dgy", "dgj", ["dgg"]] => "দ্গ্য", + ["ddy", "ddj", ["ddd"]] => "দ্দ্য", + "ddr" => "দ্দ্র", + "ddhr" => "দ্ধ্র", + ["dvy", "dwy", "dby", "dvj", "dwj", "dbj", ["dbb"]] => "দ্ব্য", + ["dry", "drj", ["drr"]] => "দ্র্য", + ["dhry", "dhrj", ["dhrr"]] => "ধ্র্য", + ["ntv", "ntw", "ntb", ["ntt"]] => "ন্ত্ব", + "ntr" => "ন্ত্র", + ["ntry", "ntrj", ["ntrr"]] => "ন্ত্র্য", + ["ndv", "ndw", "ndb", ["ndd"]] => "ন্দ্ব", + ["ndy", "ndj", ["ndd"]] => "ন্দ্য", + "ndr" => "ন্দ্র", + ["ndry", "ndrj", ["ndrr"]] => "ন্দ্র্য", + ["ndhy", "ndhj", ["ndhdh"]] => "ন্ধ্য", + "ndhr" => "ন্ধ্র", + ["pry", "prj", ["prr"]] => "প্র্য", + "mpr" => "ম্প্র", + "mbhr" => "ম্ভ্র", + "rkk" => "র্ক্ক", + "rkT" => "র্ক্ট", + ["rdhv", "rdhw", "rdhb", ["rdhdh"]] => "র্ধ্ব", + "Shkr" => "ষ্ক্র", + "Shkl" => "ষ্ক্ল", + "Shtr" => "ষ্ত্র", + ["ShThy", "ShThj", ["ShThTh"]] => "ষ্ঠ্য", + "sTr" => "স্ট্র", + "str" => "স্ত্র", + ["sthy", "sthj", ["sthth"]] => "স্থ্য", + "spr" => "স্প্র", + "spl" => "স্প্ল", +# smriti can be shriti + ["smr", ["shr"]] => "স্ম্র" + +ignore_duplicates true +can_make_cluster.each do |c1| + can_make_cluster.each do |c2| + +# Bengali, apparently, has a rule that if a half consonant comes +# before ya, va, or ma, then that 
consonant gets repeated and +# ya, va, ma will be eliminated e.g. anya > anna / onno +# mahatva > mohotto + + if c1.pattern == c2.pattern + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}য"] + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}ব"] + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}ম"] + else + if c1.match_type == 1 + consonants ["#{c1.pattern}#{c2.pattern}"] => ["#{c1.value1}#{$virama}#{c2.value1}"] + else + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}#{c2.value1}"] + end + end + end +end + +# Half forms for ম +consonants(combine can_make_cluster, ["m*"] => ["ম#{$virama}*1"]) + +generate_cv + +# default sound associated with every consonant is "o" +consonants(combine get_consonants, ["*a"] => ["*1"]) +consonants(combine get_consonants, [["*o"]] => ["*1"]) + +ignore_duplicates true +#ignore_duplicates false +#consonants [["m"]] => "ম" + +# Need to replace this when we have a way to pass value2 and value3 in combine +get_vowels.each do |vowel| + if vowel.match_type == 1 + consonant_vowel_combinations ["m#{vowel.pattern}"] => ["ম#{vowel.value2}"] + else + consonant_vowel_combinations [["m#{vowel.pattern}"]] => ["ম#{vowel.value2}"] + end +end +symbols ["m", ["n"]] => $am, + [["n", "m"]] => $an, + ["|"] => "।" + +symbols({:priority => :low}, ["aan", "aam"] => $an) + +numbers "0" => "০", + "1" => "১", + "2" => "২", + "3" => "৩", + "4" => "৪", + "5" => "৫", + "6" => "৬", + "7" => "৭", + "8" => "৮", + "9" => "৯" diff --git a/govarnam/schemes/schemes/gu/gu.scheme b/govarnam/schemes/schemes/gu/gu.scheme new file mode 100644 index 0000000..a2a3567 --- /dev/null +++ b/govarnam/schemes/schemes/gu/gu.scheme @@ -0,0 +1,201 @@ +# encoding: utf-8 + +## +# Copyright (C) Maulik DS +# +# This is part of libvarnam. 
See LICENSE.txt for the license +## + +language_code "gu" +identifier "gu" +display_name "Gujarati" +author "Rohit Bansal" + +$nukta = "\u{0ABC}" +$virama = "\u{0ACD}" + +virama "~" => "\u{0ACD}" +visarga "H" => "\u{0A83}" + +vowels "a" => "અ", + ["aa", "A", ["a"]] => ["આ", "ા"], + ["AA", ["aa"]] => ["આઽઽ", "ાઽઽ"], + ["i", ["y"]] => ["ઇ", "િ"], + ["ii", "I", "ee", ["i"]] => ["ઈ", "ી"], + "u" => ["ઉ","ુ"], + ["uu", "oo", "U", ["u"]] => ["ઊ", "ૂ"], + + ["R", ["r"]] => ["ઋ", "\u{0AC3}"], + ["RR", ["R"]] => ["ૠ", "\u{0AC4}"], + ["Lr", ["lr"]] => ["ઌ", "\u{0AE2}"], + ["LLr", ["Lr"]] => ["ૡ", "\u{0AE3}"], + + "e" => ["ઍ", "\u{0AC5}"], + ["E", ["e"]] => ["એ", "ે"], + "ea" => ["એઽ", "ેઽ"], + ["eA", ["ea"]] => ["એઽઽ", "ેઽઽ"], + ["ai", "ei"] => ["ઐ", "ૈ"], + "o" => ["ઑ", "\u{0AC9}"], + ["O", ["o"]] => ["ઓ", "ો"], + "oa" => ["ઓઽ", "ોઽ"], + ["au", "ow", "ou"] => ["ઔ", "ૌ"], + ["N", "M"] => ["અં", "\u{0A82}"], + ["AN"] => ["આ\u{0A81}", "ા\u{0A81}"], + ["IN"] => ["ઈ\u{0A81}", "ી\u{0A81}"], + ["UN"] => ["ઊ\u{0A81}", "ૂ\u{0A81}"] + + +ignore_duplicates true +#vowels ["H"] => ["અઃ", "\u{0A83}"] +ignore_duplicates false + +list :can_make_cluster do +consonants "k" => "ક", + ["kh", ["k"]] => "ખ", + "g" => "ગ", + ["gh", ["g"]] => "ઘ", + ["NG", ["ng"]] => "ઙ", + + "ch" => "ચ", + ["Ch", ["ch"]] => "છ", + "j" => "જ", + ["jh", ["j"]] => "ઝ", + ["NJ", ["nj"]] => "ઞ", + + "T" => "ટ", + ["TH", ["T"]] => "ઠ", + "D" => "ડ", + ["DH",["D"]] => "ઢ", + "NN" => "ણ", + + "t" => "ત", + ["th", ["t"]] => "થ", + "d" => "દ", + ["dh", ["d"]] => "ધ", + "n" => "ન", + + "p" => "પ", + ["ph", "f", ["p"]] => "ફ", + "b" => "બ", + ["bh", ["b"]] => "ભ", + "m" => "મ", + + "y" => "ય", + "r" => "ર", + "l" => "ળ", + ["L", ["l"]] => "લ", + ["v", "w"] => "વ", + "s" => "સ", + ["sh", ["s"]] => "શ", + ["Sh", ["sh"]] => "ષ", + "h" => "હ" +end + +ignore_duplicates true + +can_make_cluster.each do |c1| + can_make_cluster.each do |c2| + if c1.match_type == 1 + consonants ["#{c1.pattern}#{c2.pattern}"] => 
["#{c1.value1}#{$virama}#{c2.value1}"] + else + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}#{c2.value1}"] + end + end +end + +# following are the consonants clusters, having more than 2 consonants +consonants "kTr" => "ક્ટ્ર", + "kty" => "ક્ત્ય", + "ktr" => "ક્ત્ર", + ["ktv", "ktw"] => "ક્ત્વ", + "kry" => "ક્ર્ય", + "kShm" => "ક્ષ્મ", + "kShy" => "ક્ષ્ય", + ["kShv", "kShw"] => "ક્ષ્વ", + ["gdhv", "gdhw"] => "ગ્ધ્વ", + "gry" => "ગ્ર્ય", + "ghry" => "ઘ્ર્ય", + "NGkSh" => "ઙ્ક્ષ", + "chChr" => "ચ્છ્ર", + ["chChv", "chChw"] => "ચ્છ્વ", + "jjNJ" => "જ્જ્ઞ", + ["jjv", "jjw"] => "જ્જ્વ", + "jNJy" => "જ્ઞ્ય", + "NJchy" => "ઞ્ચ્ય", + "NJjNJ" => "ઞ્જ્ઞ", + "NJjy" => "ઞ્જ્ય", + ["NJjv", "NJjw"] => "ઞ્જ્વ", + "Try" => "ટ્ર્ય", + "THry" => "ઠ્ર્ય", + "Dry" => "ડ્ર્ય", + "DHry" => "ઢ્ર્ય", + "NNtr" => "ણ્ત્ર", + "NNDr" => "ણ્ડ્ર", + "tty" => "ત્ત્ય", + "ttr" => "ત્ત્ર", + ["ttv", "ttw"] => "ત્ત્વ", + "tny" => "ત્ન્ય", + "try" => "ત્ર્ય", + ["trv", "trw"] => "ત્ર્વ", + "thry" => "થ્ર્ય", + "dgr" => "દ્ગ્ર", + "dghr" => "દ્ઘ્ર", + "ddr" => "દ્દ્ર", + ["ddv", "ddw"] => "દ્દ્વ", + "ddhr" => "દ્ધ્ર", + ["ddhv", "ddhw"] => "દ્ધ્વ", + "dbr" => "દ્્ર", + "dbhr" => "દ્ભ્ર", + "dmy" => "દ્મ્ય", + "dry" => "દ્ર્ય", + "dhry" => "ધ્ર્ય", + "ntr" => "ન્ત્ર", + ["ntv", "ntw"] => "ન્ત્વ", + "nddh" => "ન્દ્ધ", + "ndy" => "ન્દ્ય", + "ndr" => "ન્દ્ર", + ["ndv", "ndw"] => "ન્દ્વ", + "ndhr" => "ન્ધ્ર", + ["ndhv", "ndhw"] => "ન્ધ્વ", + "nny" => "ન્ન્ય", + "pty" => "પ્ત્ય", + "ptr" => "પ્ત્ર", + ["ptv", "ptw"] => "પ્ત્વ", + ["bdhv", "bdhw"] => "બ્ધ્વ", + "rty" => "ર્ત્ય", + "rtr" => "ર્ત્ર", + "rts" => "ર્ત્સ", + "lTr" => "લ્ટ્ર", + "lDr" => "લ્ડ્ર", + "shchy" => "શ્ચ્ય", + "shny" => "શ્ન્ય", + "shry" => "શ્ર્ય", + ["shrv", "shrw"] => "શ્ર્વ", + "shly" => "શ્લ્ય", + ["shvy", "shwy"] => "શ્વ્ય", + "Shkr" => "ષ્ક્ર", + "ShTr" => "ષ્ટ્ર", + ["ShTv", "ShTw"] => "ષ્ટ્વ", + "skr" => "સ્ક્ર", + "sTr" => "સ્ટ્ર", + "str" => "સ્ત્ર", + ["stv", "stw"] => "સ્ત્વ", + "sry" => "સ્ર્ય", 
+ ["srv", "srw"] => "સ્ર્વ" + +generate_cv + +consonants(combine get_consonants, ["*a"] => ["*1"]) + +symbols "om" => "ૐ" + +numbers "0" => "૦", + "1" => "૧", + "2" => "૨", + "3" => "૩", + "4" => "૪", + "5" => "૫", + "6" => "૬", + "7" => "૭", + "8" => "૮", + "9" => "૯" diff --git a/govarnam/schemes/schemes/hi/hi.scheme b/govarnam/schemes/schemes/hi/hi.scheme new file mode 100644 index 0000000..b1b3397 --- /dev/null +++ b/govarnam/schemes/schemes/hi/hi.scheme @@ -0,0 +1,133 @@ +# encoding: utf-8 + +## +# Copyright (C) Prateek Kumar Baheti +# +# This is part of libvarnam. See LICENSE.txt for the license +## + +language_code "hi" +identifier "hi" +display_name "Hindi" +author "Prateek Kumar Baheti" +stable true + +ignore_duplicates false + +$virama = "\u{094D}" +$am = "\u{0902}" +$an = "\u{0901}" +$nukta = "\u{093c}" + +virama "~" => "\u{094d}" +period "\u{0964}" + +visarga "H" => "\u{0903}" + +vowels "a" => "अ", + ["aa", "A", ["a"]] => ["आ", "ा"], + "i" => ["इ", "ि"], + ["ee", "I", "ii", ["i"]] => ["ई", "ी"], + "u" => ["उ", "ु"], + ["uu", "oo", "U", ["u"]] => ["ऊ", "ू"], + [["ri", "ru", "r"], "R"] => ["ऋ", "\u{0943}"], + "e" => ["ए", "े"], + ["ai", "ei"] => ["ऐ", "ै"], + "o" => ["ओ", "ो"], + "O" => ["ओ", "\u{0949}"], + ["ou", "au", "ow"] => ["औ", "ौ"], + ["aN", "aM",] => ["अं" ,"ं"], + ["aH"] => ["अः", "ः"] + +list :can_make_cluster do +consonants "k" => "क", + [["k"], "q"] => ["\u{0958}", "क#{$nukta}"], + ["kh", ["gh"]] => "ख", + [["kh"], "KH"] => ["\u{0959}", "ख#{$nukta}"], + ["gh", ["kh"]] => "घ", + ["T", ["t"]] => "ट", + ["Th"] => "ठ", + ["D", ["d"]] => "ड", + [["d"], "dd"] => ["\u{095C}", "ड#{$nukta}"], + "d" => "द", + ["Dh"] => ["ढ"], + "DH" => ["\u{095D}", "ढ#{$nukta}"], + "p" => "प", + "ph" => "फ", + "b" => "ब", + "bh" => "भ", + ["t"] => "त", + ["th", ["dh"]] => "थ", + "g" => "ग", + ["G"] => ["\u{095A}", "ग#{$nukta}"], + ["j"] => "ज", + "ng" => "ङ", + "ch" => "च", + ["CH", ["ch"]] => "छ", + ["jh"] => "झ", + ["nj"] => "ञ", + ["N"] => "ण", + ["dh"] => "ध", + 
["n"] => "न", + "y" => "य", + "r" => ["र", "र#{$virama}"], + "l" => ["ल", "ल#{$virama}"], + ["v", "w"] => "व", + ["sh", ["s"]] => "श", + ["Sh", ["sh"]] => "ष", + ["h"] => "ह", + ["z"] => "ज#{$nukta}", + ["s"] => "स", + "f" => ["\u{095E}", "फ#{$nukta}"], + "y" => "य", + [["y"], "YY"] => ["\u{095F}", "य#{$nukta}"] +end + +ignore_duplicates true +can_make_cluster.each do |c1| + can_make_cluster.each do |c2| + if c1.match_type == 1 + consonants ["#{c1.pattern}#{c2.pattern}"] => ["#{c1.value1}#{$virama}#{c2.value1}"] + else + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}#{c2.value1}"] + end + end +end + +# Half forms for म +consonants(combine can_make_cluster, ["m*"] => ["म#{$virama}*1"]) + +generate_cv + +consonants(combine get_consonants, ["*a"] => ["*1"]) + +ignore_duplicates false + +consonants [["m"]] => "म" + +# Need to replace this when we have a way to pass value2 and value3 in combine +get_vowels.each do |vowel| + if vowel.match_type == 1 + consonant_vowel_combinations ["m#{vowel.pattern}"] => ["म#{vowel.value2}"] + else + consonant_vowel_combinations [["m#{vowel.pattern}"]] => ["म#{vowel.value2}"] + end +end +symbols ["m", ["n"]] => $am, + [["n", "m"]] => $an, + ["|"] => "।" + +symbols({:priority => :low}, ["aan", "aam"] => $an) + + + +numbers "0" => "०", + "1" => "१", + "2" => "२", + "3" => "३", + "4" => "४", + "5" => "५", + "6" => "६", + "7" => "७", + "8" => "८", + "9" => "९" diff --git a/govarnam/schemes/schemes/kn/kn.scheme b/govarnam/schemes/schemes/kn/kn.scheme new file mode 100644 index 0000000..ae74606 --- /dev/null +++ b/govarnam/schemes/schemes/kn/kn.scheme @@ -0,0 +1,122 @@ +# encoding: utf-8 + +## +# Copyright (C) Shwetha Thammaiah, Sharath Battaje +# +# This is part of libvarnam. See LICENSE.txt for the license +# +# This is a Ruby file, and hence follows Ruby syntax +## + +language_code "kn" +identifier "kn" +display_name "Kannada" +author "Shwetha Thammaiah, Sharath Kr. 
Battaje" + +ignore_duplicates false + +$am = "\u{0C82}" +$an = "\u{0C81}" + +$virama = "\u{0CCD}" +$nukta = "\u{0CBC}" +$avagraha = "\u{0CBD}" + +virama "~" => "\u{0CCD}" + +infer_dead_consonants true + +vowels "a" => "ಅ", + ["aa", "A", ["a"]] => ["ಆ", "ಾ"], + "i" => ["ಇ", "ಿ"], + ["ee", "I", "ii", ["i"]] => ["ಈ", "ೀ"], + "u" => ["ಉ", "ು"], + ["uu", "oo", "U", ["u"]] => ["ಊ", "ೂ"], + ["rRu"] => ["ಋ", "\u{0CC3}"], + ["Rru"] => ["ೠ", "\u{0CC4}"], + ["e",["y"]] => ["ಎ", "ೆ"], + ["E", ["e"]] => ["ಏ", "\u{0CC7}"], + ["ai", "ei"] => ["ಐ", "ೈ"], + "o" => ["ಒ", "ೊ"], + ["O", ["o"]] => ["ಓ", "ೋ"], + ["ou", "au", "ow"] => ["ಔ", "ೌ"], + ["aN", "aM"] => ["ಅಂ" ,"ಂ"], + ["oN", "oM"] => ["ಒಂ" ,"ಂ"], + ["aH"] => ["ಅಃ", "ಃ"] + +vowels({:priority => :low, :accept_if => :ends_with}, [["y"]] => ["ഇ", "ി"]) + + +consonants ["ka", "ca"] => "ಕ", + ["kha", ["gha"]] => "ಖ", + "ga" => "ಗ", + ["gha", ["kha"]] => "ಘ", + ["NGa", ["nga"]] => "ಙ", + "cha" => "ಚ", + ["CHa", ["cha", "jha"]] => "ಛ", + "ja" => "ಜ", + ["jha", "JHa"] => "ಝ", + [["nja"], "NJa"] => "ಞ", + ["ta", ["tta"]] => "ಟ", + [["da", "ta"], "Ta"] => "ಠ", + [["da", "ta"], "Da"] => "ಡ", + [["da"], "Dha"] => "ಢ", + ["Na"] => "ಣ", + ["tha", ["ta"]] => "ತ", + ["THa"] => "ತ್ತ", + [["tha", "dha"], "thha"] => "ಥ", + "da"=> "ದ", + "dha" => "ಧ", + "pa" => "ಪ", + ["pha", ["fa"]] => "ಫ", + [["pha"], "fa"] => "ೞ", + "ba" => "ಬ", + "bha" => "ಭ", + "ya" => "ಯ", + ["Ra"] => "ಱ", + "la" => ["ಲ"], + ["La"] => ["ಳ", "\u{0CE2}"], + ['mma'] => ["ಮ್ಮ"], + [['tha']] => 'ತ್ತ', + ["va","wa"] => "ವ", + ["sha", ["sa"]] => "ಶ", + ["Sha"] => "ಷ", + "sa" => "ಸ", + "ha" => "ಹ", + ["ksha", "Ksha"] => "ಕ್ಷ", + # ["tra", "Tra"] => "ಱ", + ["`jn"] => "ಜ್ಞ", + ["x"] => "ಕ್ಸ್", + ["za"] => "ಜ\u{0CBC}", + ["rra"] => "\u{0CB0}#{$virama}\u{0CB0}" + +infer_dead_consonants false + +consonants ["na"] => "ನ", + ["ma"] => "ಮ", + "ra" => "ರ" +infer_dead_consonants true + +generate_cv + + +tag "chill" do + consonants "m" => ["ಂ","ಮ"] + consonants "n" => ["ಂ","ನ\u0CCD"] + 
consonants "r" => ["ರ\u0CCD"] +end + +symbols({:priority => :low}, ["aan", "aam"] => $an) + + + +numbers "0" => "೦", + "1" => "೧", + "2" => "೨", + "3" => "೩", + "4" => "೪", + "5" => "೫", + "6" => "೬", + "7" => "೭", + "8" => "೮", + "9" => "೯" diff --git a/govarnam/schemes/schemes/ml-inscript/ml-inscript.scheme b/govarnam/schemes/schemes/ml-inscript/ml-inscript.scheme new file mode 100644 index 0000000..8a12099 --- /dev/null +++ b/govarnam/schemes/schemes/ml-inscript/ml-inscript.scheme @@ -0,0 +1,119 @@ +# encoding: utf-8 + +## +# +# +# This is part of libvarnam. See LICENSE.txt for the license +## + +language_code "ml" +identifier "ml-inscript" +display_name "Malayalam Enhanced Inscript" +author "Navaneet KN, Mujeeb B Positive, Subin Siby" + +infer_dead_consonants false + +$zwnj = "\u{200c}" +$zwj = "\u{200d}" + + +vowels "D" => "അ", + "E" => "ആ", + "F" => "ഇ", + "R" => "ഈ", + "G" => "ഉ", + "T" => "ഊ", + "+" => "ഋ", + "Z" => "എ", + "S" => "ഏ", + "W" => "ഐ", + "~" => "ഒ", + "A" => "ഓ", + "Q" => "ഔ", + "q" => ["ൗ", "ൗ"], + "w" => ["ൈ", "ൈ"], + "e" => ["ാ", "ാ"], + "r" => ["ീ", "ീ"], + "t" => ["ൂ", "ൂ"], + "a" => ["ോ", "ോ"], + "s" => ["േ", "േ"], + "d" => ["്", "്"], + "f" => ["ി", "ി"], + "g" => ["ു", "ു"], + "`" => ["ൊ", "ൊ"], + "z" => ["െ", "െ"], + "x" => ["ം", "ം"] + +consonants "k" => "ക", + "K" => "ഖ", + "i" => "ഗ", + "I" => "ഘ", + "U" => "ങ", + ";" => "ച", + ":" => "ഛ", + "p" => "ജ", + "P" => "ഝ", + "}" => "ഞ", + "'" => "ട", + "\"" => "ഠ", + "[" => "ഡ", + "{" => "ഢ", + "C" => "ണ", + "L" => "ഥ", + "l" => "ത", + "O" => "ധ", + "o" => "ദ", + "v" => "ന", + "h" => "പ", + "H" => "ഫ", + "y" => "ബ", + "Y" => "ഭ", + "c" => "മ", + "/" => "യ", + "j" => "ര", + "n" => "ല", + "b" => "വ", + "M" => "ശ", + "<" => "ഷ", + "m" => "സ", + "u" => "ഹ", + "N" => "ള", + "B" => "ഴ", + "J" => "റ", + "#" => "്ര", + "&" => "ക്ഷ", + "=" => "ൃ" + +numbers "|1" => "൧", + "|2" => "൨", + "|3" => "൩", + "|4" => "൪", + "|5" => "൫", + "|6" => "൬", + "|7" => "൭", + "|8" => "൮", + "|9" => "൯", + "|0" => 
"൦" + +symbols "_" => "ഃ" + +# value1 = atomic chil. value2 = old style. value3 = base letter +tag "chill" do + consonants "V" => ["ൻ", "ന്‍", "ന"], + "X" => ["ൺ", "ണ്‍", "ണ"], + ">" => ["ൽ", "ല്‍", "ല"], + "*" => ["ൾ", "ള്‍", "ള"], + "\\" => ["ർ", "ര്‍", "ര"] +end + +joiner "^1" => $zwj # caret 1 +non_joiner "^2" => $zwnj # caret 2 + +others "^4" => "₹" +others "|;" => ";" +others "|'" => "'" +others "|\"" => "\"" +others "|-" => "-" +others "|>" => ">" +others "|*" => "*" +others "|\\" => "\\" diff --git a/govarnam/schemes/schemes/ml/ml-basic/pack.json b/govarnam/schemes/schemes/ml/ml-basic/pack.json new file mode 100644 index 0000000..38c46ea --- /dev/null +++ b/govarnam/schemes/schemes/ml/ml-basic/pack.json @@ -0,0 +1,58 @@ +{ + "identifier": "ml-basic", + "name": "Malayalam Basic", + "description": "Words sourced from Malayalam Wikipedia. Each file has maximum 30,000 words.", + "lang": "ml", + "pages_count": 8, + "total_words": 214617, + "pages": [ + { + "identifier": "ml-basic-1", + "page": 1, + "description": "Words with confidence lesser than 266", + "size": 1788238 + }, + { + "identifier": "ml-basic-2", + "page": 2, + "description": "Words with confidence lesser than 222", + "size": 1881604 + }, + { + "identifier": "ml-basic-3", + "page": 3, + "description": "Words with confidence lesser than 188", + "size": 1915021 + }, + { + "identifier": "ml-basic-4", + "page": 4, + "description": "Words with confidence lesser than 154", + "size": 1944241 + }, + { + "identifier": "ml-basic-5", + "page": 5, + "description": "Words with confidence lesser than 121", + "size": 1963814 + }, + { + "identifier": "ml-basic-6", + "page": 6, + "description": "Words with confidence lesser than 89", + "size": 1969192 + }, + { + "identifier": "ml-basic-7", + "page": 7, + "description": "Words with confidence lesser than 54", + "size": 1967755 + }, + { + "identifier": "ml-basic-8", + "size": 309487, + "description": "Words with confidence lesser than 22", + "page": 8 + } + ] +} \ No 
newline at end of file diff --git a/govarnam/schemes/schemes/ml/ml-basic/word-frequency-report.txt b/govarnam/schemes/schemes/ml/ml-basic/word-frequency-report.txt new file mode 100644 index 0000000..e169fed --- /dev/null +++ b/govarnam/schemes/schemes/ml/ml-basic/word-frequency-report.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75947746747f925bea02ed03d14fa57b1883930907a2fecf77e2468484f67cee +size 8169491 diff --git a/govarnam/schemes/schemes/ml/ml-english/pack.json b/govarnam/schemes/schemes/ml/ml-english/pack.json new file mode 100644 index 0000000..cfa16fd --- /dev/null +++ b/govarnam/schemes/schemes/ml/ml-english/pack.json @@ -0,0 +1,16 @@ +{ + "identifier": "ml-english", + "name": "English words in Malayalam", + "description": "Words like india => ഇന്ത്യ, scene => സീൻ etc.", + "lang": "ml", + "pages_count": 1, + "total_words": 3039, + "pages": [ + { + "identifier": "ml-english-1", + "page": 1, + "description": "Words with confidence lesser than 44", + "size": 144837 + } + ] +} \ No newline at end of file diff --git a/govarnam/schemes/schemes/ml/ml-english/patterns.txt b/govarnam/schemes/schemes/ml/ml-english/patterns.txt new file mode 100644 index 0000000..0de85a9 --- /dev/null +++ b/govarnam/schemes/schemes/ml/ml-english/patterns.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd7a3e0824d687fd25454dbc0ff8ab33ef8166eeb81d6b77b579ff5f1c57a261 +size 46186 diff --git a/govarnam/schemes/schemes/ml/ml.scheme b/govarnam/schemes/schemes/ml/ml.scheme new file mode 100644 index 0000000..e67e0d3 --- /dev/null +++ b/govarnam/schemes/schemes/ml/ml.scheme @@ -0,0 +1,461 @@ +#!/usr/bin/env ruby +# encoding: utf-8 + +## +# Copyright (C) Navaneeth.K.N +# +# This is part of libvarnam. 
See LICENSE.txt for the license +# +# Copyright (C) Subin Siby, 2021 +## + +language_code "ml" +identifier "ml" +display_name "Malayalam" +author "Navaneeth KN" +stable true + +$virama = "്" +virama "~" => "്" + +# zwj and zwnj are __ & _ by default in all schemes in varnam +# unless explicitly override in scheme + +infer_dead_consonants true + +vowels "a" => "അ", + [["a"], "aa", "A"] => ["ആ", "ാ"], + "i" => ["ഇ", "ി"], + ["ee", "I", "ii", ["i"]] => ["ഈ", "ീ"], + "u" => ["ഉ", "ു"], + [["u"], "uu", "oo", "U"] => ["ഊ", "ൂ"], + [["ri", "ru"], "R"] => ["ഋ", "ൃ", "ർ"], + "e" => ["എ", "െ"], + ["E", ["e"]] => ["ഏ", "േ"], + ["ai", "ei"] => ["ഐ", "ൈ"], + "o" => ["ഒ", "ൊ"], + ["O", ["o"]] => ["ഓ", "ോ"], + ["ou", "au", "ow"] => ["ഔ", "ൗ"], + ["OU", "AU", "OW"] => ["ഔ", "ൌ"] + +# Scheme Change July 25, 2021: ou & OU swapped +# By Subin Siby + +vowels({:priority => :low, :accept_if => :ends_with}, [["y"]] => ["ഇ", "ി"]) + +consonants ["ka"] => "ക", + ["kha", ["gha"]] => "ഖ", + "ga" => "ഗ", + ["gha", ["kha"]] => "ഘ", + "cha" => "ച", + ["CHa", ["cha", "jha"]] => "ഛ", + [["cha"]] => "ച്ഛ", + "ja" => "ജ", + ["jha", "JHa"] => "ഝ", + [["nja"], "NJa"] => "ഞ്ഞ", + ["ta", ["tta"]] => "റ്റ", + [["da", "ta"], "Ta"] => "ട", + [["da", "ta"], "TTa"] => "ഠ", + ["Da", ["da"]] => "ഡ", + [["da"], "DDa"] => "ഢ", + ["tha"] => "ത", + [["tha", "dha"], "thha"] => "ഥ", + [["tha", "dha"], "tathha"] => "ത്ഥ", + "da" => "ദ", + [["dha"], "ddha"] => "ദ്ധ", + "dha" => "ധ", + "pa" => "പ", + ["pha", "fa", "Fa"] => "ഫ", + "ba" => "ബ", + "bha" => "ഭ", + ["va", "wa"] => "വ", + ["Sa", ["sha", "sa"]] => "ശ", + ["sa", "za"] => "സ", + "ha" => "ഹ" + +consonants({:accept_if => :starts_with}, ["ca"] => "ക") + +# Scheme Change August 23, 2021: nga => ങ്ങ (double ങ) +# By Subin Siby +consonants "nja" => ["ഞ", "ഞ്ഞ"], + + # Double ങ (ങ്ങ) has more usage than single + "nga" => ["ങ്ങ", "ങ"], + "NGa" => "ങ" + +consonants ["kra"] => "ക്ര", + "gra" => "ഗ്ര", + ["ghra", ["khra"]] => "ഘ്ര", + ["CHra", ["chra", "jhra"]] => "ഛ്ര", 
+ "jra" => "ജ്ര", + [["dra", "tra"], "Tra"] => "ട്ര", + ["Dra", ["dra"]] => "ഡ്ര", + "Dhra" => "ഢ്ര", + ["thra", ["tra"]] => "ത്ര", + "dra" => "ദ്ര", + ["ddhra", ["dhra"]] => "ദ്ധ്ര", + "dhra" => "ധ്ര", + "pra" => "പ്ര", + ["phra", "fra", "Fra"] => "ഫ്ര", + "bra" => "ബ്ര", + "bhra" => "ഭ്ര", + ["vra", "wra"] => "വ്ര", + ["Sra", ["shra", "sra"]] => "ശ്ര", + "shra" => "ഷ്ര", + ["sra", "zra"] => "സ്ര", + "hra" => "ഹ്ര", + "nthra" => "ന്ത്ര", + [["ndra", "ntra"], "nDra", "Ntra", "nTra"] => "ണ്ട്ര", + "ndra" => "ന്ദ്ര", + [["thra"], "THra", "tthra"] => "ത്ത്ര", + "nnra" => "ന്ന്ര", + ["kkra", "Kra", "Cra"] => "ക്ക്ര", + ["mpra", "mbra"] => "മ്പ്ര", + ["skra","schra"] => "സ്ക്ര", + "ndhra" => "ന്ധ്ര", + "nmra" => "ന്മ്ര", + ["NDra", ["ndra"]] => "ണ്ഡ്ര" + +consonants({:accept_if => :starts_with}, ["cra"] => "ക്ര") + +consonants "ya" => "യ", + "sha" => "ഷ", + "zha" => "ഴ", + ["xa", ["Xa"]] => "ക്സ", + "ksha" => "ക്ഷ", + "nka" => "ങ്ക", + ["ncha", ["nja"]] => "ഞ്ച", + "ntha" => "ന്ത", + "nta" => "ന്റ", + [["nda"], "nDa", "Nta"] => "ണ്ട", + "nda" => "ന്ദ", + "tta" => "ട്ട", + [["tha"], "THa", "ttha"] => "ത്ത", + "lla" => "ല്ല", + ["LLa", ["lla"]] => "ള്ള", + "nna" => "ന്ന", + ["NNa", ["nna"]] => "ണ്ണ", + ["bba", "Ba"] => "ബ്ബ", + ["kka", "Ka"] => "ക്ക", + ["gga", "Ga"] => "ഗ്ഗ", + ["jja", "Ja"] => "ജ്ജ", + ["mma", "Ma"] => "മ്മ", + ["ppa", "Pa"] => "പ്പ", + ["vva", "Va", "wwa", "Wa"] => "വ്വ", + ["yya", "Ya"] => "യ്യ", + ["mpa", "mba"] => "മ്പ", + ["ska","scha"] => "സ്ക", + [["cha"], "chcha", "ccha", "Cha"] => "ച്ച", + "ndha" => "ന്ധ", + "jnja" => "ജ്ഞ", + "nma" => "ന്മ", + ["Nma", ["nma"]] => "ണ്മ", + ["nJa", ["nja"]] => "ഞ്ജ", + ["NDa", ["nda"]] => "ണ്ഡ" + +# Don't need to infer dead consonants because removing 'a' from pattern will make what is mapped for chill +infer_dead_consonants false + +consonants ["ra"] => "ര", + [["ra"], "Ra"] => "റ", + ["na"] => "ന", + [["na"], "Na"] => "ണ", + ["la"] => "ല", + [["la"], "La"] => "ള" + +infer_dead_consonants true + +consonants 
["rva", "rwa"] => "ര്വ", + "rya" => "ര്യ", + ["Rva", "Rwa", ["rva"]] => "റ്വ്", + ["Rya", ["rya"]] => "റ്യ്", + ["nva", "nwa"] => "ന്വ", + "nya" => "ന്യ", + ["Nva", "Nwa", ["nva", "nwa"]] => "ണ്വ", + ["Nya", ["nya"]] => "ണ്യ", + ["lva", "lwa"] => "ല്വ", + "lya" => "ല്യ", + ["Lva", "Lwa", ["lva", "lwa"]] => "ള്വ", + ["Lya", ["lya"]] => "ള്യ" + +# BEGIN Anusvara <-> ma complications + +anusvara [["m"]] => ["ം","ം","മ"] +anusvara "m_" => ["ം","ം","മ"] +anusvara({:accept_if => :ends_with}, "m" => ["ം","ം","മ"]) +anusvara({:accept_if => :in_between}, "m" => ["ം","ം","മ"]) + +consonants ["ma"] => "മ" + +# END Anusvara <-> ma complications + +# Autogenerate consonant vowel combinations +generate_cv + +delete_token( + value1: "മ്" +) +consonants({:accept_if => :starts_with}, "m" => "മ്") + +consonants [["ru"]] => "ര്", + [["r~", "ru"]] => "റ്", + [["nu"]] => "ന്", + [["n~", "nu"]] => "ണ്", + [["lu"]] => "ല്", + [["l~", "lu"]] => "ള്", + ["r~"] => "ര്", + ["R~"] => "റ്", + ["n~"] => "ന്", + ["N~"] => "ണ്", + ["l~"] => "ല്", + ["L~"] => "ള്" + +# value1 = atomic chil. value2 = old style. 
value3 = base letter +tag "chill" do + consonants "n" => ["ൻ", "ന്‍", "ന"], + ["N", ["n"]] => ["ൺ", "ണ്‍", "ണ"], + "l" => ["ൽ", "ല്‍", "ല"], + ["L", ["l"]] => ["ൾ", "ള്‍", "ള"], + ["r"] => ["ർ", "ര്‍", "ര"] +end + +# START July 7, 2021 change +# https://gitlab.com/subins2000/govarnam/-/issues/2 + +tag "chill" do + consonants( + { + :accept_if => :in_between, + :match_type => :match_exact + }, + combine(get_chill, ["*_"] => ["*1", "*2", "*3"]) + ) +end + +# END July 7, 2021 change + +# * is the place holder which will be replaced by the actual value +consonants({:accept_if => :ends_with}, combine(get_dead_consonants, [["*u"]] => "*1")) + +symbols "/" => "ഽ", + ["H", [":"]] => "ഃ" + +numbers "0" => "൦", + "1" => "൧", + "2" => "൨", + "3" => "൩", + "4" => "൪", + "5" => "൫", + "6" => "൬", + "7" => "൭", + "8" => "൮", + "9" => "൯" + +stemrules "ാ" => "്", +"ും" => "്", +"ണു" => "ണ്", +"ന്റെ" => "ൻ", +"ൻറെ" => "ൻ", +"ന്റേ" => "ൻ", +"ന്" => "ൻ", +"ിനു" => "ിൻ", +"നേ" => "ൻ", +"നെ" => "ൻ", +"നു" => "ൻ", +"ള്" => "ൾ", +"ളു" => "ൾ", +"ളെ" => "ൾ", +"ളേ" => "ൾ", +"ളോ" => "ൾ", +"ളൂ" => "ൾ", +"ല്" => "ൽ", +"ലെ" => "ൽ", +"ലേ" => "ൽ", +"ലോ" => "ൽ", +"ലു" => "ൽ", +"ലൂ" => "ൽ", +"റു" => "ർ", +"രെ" => "ർ", +"രേ" => "ർ", +"ര്" => "ർ", +"രു" => "ർ", +"രൊ" => "ർ", +"വ്" => "ം", +"മ്" => "ം", +"മു" => "ം", +"മേ" => "ം", +"മോ" => "ം", +"മൊ" => "ം", +"ത്ത്" => "ം", +"ത്തേ" => "ം", +"തോ" => "ത്", +"ച്ചേ" => "ച്ചു", +"യ്" => "", +"യു" => "", +"യെ" => "", +"യേ" => "", +"യി" => "", +"യോ" => "", +"ടെ" => "", +"ടേ" => "", +"ടോ" => "ട്", +"ക്ക്" => "", +"ക്കു" => "", +"ക്കെ" => "", +"ല്ലാം" => "", +"ല്ലോ" => "", +"ണ്ടു" => "ണ്ട്", +"ണ്ടോ" => "ണ്ട്", +"ണ്ടിൽ" => "ണ്ട്", +"ളിൽ" => "ൾ", +"മായ" => "ം", +"മായി" => "ം", +"മാക്കി" => "ം", +"മാക്കും" => "ം", +"മാണ്" => "ം", +"മുണ്ട്" => "ം", +"മെന്ന" => "ം", +"മെന്ന്" => "ം", +"മിൻ" => "ം", +"മാർ" => "", +"ണെന്നു" => "ണ്", +"ണെന്ന്" => "ണ്", +"ണെന്ന" => "ണ്", +#"ന്ൻ" => "ന്ന്", +"ന്നോ" => "ന്ന്", +"ന്നല്ല" => "ന്ന്", +"ന്നാണ്‌" => "ന്ന്", +"ന്നാക്കി" => 
"ന്ന്", +"ന്നത്" => "ന്ന", +"ന്നതു" => "ന്ന", +"ന്നുണ്ട്" => "ന്ന", +"നിന്ന്" => "", +"നായ" => "ൻ", +"നോടു"=> "ൻ", +"ട്ടില്ല" => "ട്ട്", +#"ളുടെ" => "ൾ", +"ൾക്ക്" => "ൾ", +"ൾക്കും" => "ൾ", +"ളാണ്" => "ൾ", +"ളായ" => "ൾ", +"ളോട്" => "ൾ", +"ളോളം" => "ൾ", +"ളുണ്ട്" => "ൾ", +"ളാൽ"=> "ൾ", +"റിൽ" => "ർ", +"രായ" => "ർ", +"രോട്" => "ർ", +"വായ" => "വ്", +"ങ്ങൾ" => "ം", +"ത്തിൽ" => "ം", +"ത്തിൻ" => "ം", +"ത്തായ" => "ത്ത്", +"ത്തുന്നു" => "ന്ന", +"ത്തുന്ന" => "ന്ന", +"ത്തുക" => "ന്ന", +"ത്തെ" => "ം", +"ത്തോ" => "ം", +"വുന്ന" => "ം", +"ലേക്ക്" => "ൽ", +"ലിൽ" => "ൽ", +"ലാണ്" => "ൽ", +"ലായ" => "ൽ", +"ലുളള" => "ൽ", +"ലാക്കി" => "ൽ", +"ല്ലെന്നു" => "ല്ല", +"ല്ലെന്ന്" => "ല്ല", +"ല്ലാത്ത" => "ല്ല", +"ല്ലെന്ന" => "ല്ല", +"ലുണ്ട്" => "ൽ", +"രുടെ" => "ർ", +"രാണ്" => "ർ", +"രിൽ" => "ർ", +"വിൽ" => "വ്", +"യവ" => "യ", +"യിരി" =>"ണ്‌", +"കൾ‍" => "", +"ാതെ" => "", +"യായ" => "", +"യുള്ളൂ" => "", +"യുള്ളു" => "", +"യാണോ" => "", +"യായി" => "", +"യെന്ന" => "", +"യാണ്" => "", +"യത്" => "യ", +"യല്ല" => "", +"യുണ്ട്" => "", +"യില്ല" => "", +"യുടെ" => "", +"യെന്നു" => "", +"യിൽ" => "", +"കൾ" => "", +"ക്കൾ" => "", +"ക്കാൾ" => "", +"ക്കുന്നു" => "", +"തന്നെ" => "", +"ത്തന്നെ" => "", +"ക്കാൻ" => "ക്കുക", +"ക്കേണ്ടി" => "ക്കുക", +"ക്കൊണ്ട്" => "", +"കൊണ്ട്" => "", +"കൊണ്ടു" => "", +"ക്കൊണ്ടു" => "", +"ക്കാണ്" => "", +"ക്കിൽ" => "ക്ക്", +"െന്ന്" => "്", +"ോളം" => "്", +"ാക്" => "്", +"ാക്കി" => "്", +"ിരുന്നു" => "്", +"ിരുന്ന" =>"്", +"ാണ്" => "്", +"ിത്" => "്", +#"ിൽ" => "്", +"ില്ല" => "ിൽ" , +"ുള്ള" => "്", +"ുള്ളത്" => "്", +"ുണ്ട്" => "്", +"ിയ" => "ി", +"ച്ചത്" => "ച്ചു", +"പ്പിൽ" => "പ്പ്", +"പ്പിച്ച" => "ച്ച", +"താണ്‌" => "ത്", +"തല്ല" => "ത്", +"തിൻ" => "ത്", +"ഴാണ്‌" => "ഴ്", +"വാണ്‌" => "വ്", +"യിരിക്കുന്ന" => "", +"യിരിക്കുന്നു" => "", +"യിരുന്നത്" => "", +"യിരുന്ന" => "", +"യിരുന്നു" => "", +"യാകട്ടെ" => "", +"ണ്ടായി" => "", +"ണ്ടായ" => "", +"പ്പെടുന്നു" => "ുന്ന", +"ിട്ടുണ്ട്" => "്", +"െങ്കിൽ" => "്", +"പ്പിക്കുക" => "പ്പിച്ചു", +"ക്കപ്പെട്ട" => "ക്കുക", +"ക്കുവാൻ" => "ക്കുക" + 
+exceptions_stem "ന്" => "ന്", + "നെ" => "ന്", + "നു" => "ന്", + "കൾ" => "ക്", + "കൊണ്ടു" => "ക്", + "കൊണ്ട്" => "ക്", + "ലോ" => "ല്", + "ല്ല" => "ല്", + "ടെ" => "ട്", + "തോ" => "ത്", + "താണ്‌" => "ത്", + "തായ" => "ത്", + "തിൻ" => "ത്", + "തന്നെ" => "ത്", + "വ്" => "ി", + "ിക്കുന്ന" => "ര", + "ുണ്ട്" => "ന്ന", + "ളു" => "ള്", + "ളി" => "ള്" diff --git a/govarnam/schemes/schemes/ml/symbol-frequency-report.txt b/govarnam/schemes/schemes/ml/symbol-frequency-report.txt new file mode 100644 index 0000000..ba39a78 --- /dev/null +++ b/govarnam/schemes/schemes/ml/symbol-frequency-report.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d24c8a04d3bf8f54c6236de7c9fa002e37eaf6d27a8e3bacab8844262aec76d +size 13049 diff --git a/govarnam/schemes/schemes/mr/mr.scheme b/govarnam/schemes/schemes/mr/mr.scheme new file mode 100644 index 0000000..3f39183 --- /dev/null +++ b/govarnam/schemes/schemes/mr/mr.scheme @@ -0,0 +1,297 @@ +# encoding: utf-8 + +## +# Copyright (C) Manish C Pillewar +# +# This is part of libvarnam. 
See LICENSE.txt for the license +## + +language_code "mr" +identifier "mr" +display_name "Marathi" +author "Manish C Pillewar" +#stable true + +# +# since Marathi uses Devnagri script, this file is very similar to Hindi mapping file +# except some specific vowel sounds +# + +ignore_duplicates false + +$virama = "\u{094D}" +$am = "\u{0902}" +$an = "\u{0901}" +$nukta = "\u{093c}" + +virama "~" => "\u{094d}" + +visarga "H" => "\u{0903}" + +vowels "a" => "अ", + ["aa", "A"] => ["आ", "ा"], + ["AAA", ["aaa", "a"]] => ["आऽऽ", "ाऽऽ"], + ["ae", ["a"]] => "ॲ", + ["i",["e"]] => ["इ", "ि"], + ["ee", "I", "ii"] => ["ई", "ी"], + "u" => ["उ", "ु"], + ["uu", "U", ["u"]] => ["ऊ", "ू"], + ["RR", "Ru", ["ru", "ri"]] => ["ऋ", "\u{0943}"], + ["RRu", ["RR", "Ru"]] => ["ॠ", "\u{0944}"], + ["LR", ["Lr", "lR"]] => ["ऌ", "\u{0962}"], + ["LLR", ["LR"]] => ["ॡ", "\u{0963}"], + ["Ae"] => ["ऍ", "\u{0945}"], + ["e", ["Ae"]] => ["ए", "े"], + "ea" => ["एऽ", "ेऽ"], + ["ai", "ei","E", ["e"]] => ["ऐ", "ै"], + ["ay",["ai"]] => ["\u{090E}","\u{0946}"], + "o" => ["ऒ", "\u{094A}"], + "oa" => ["ओऽ", "ोऽ"], + ["O", ["o"]] => ["ओ", "\u{094B}"], + ["ou", "au", ["ow"]] => ["औ", "ौ"], + ["aw", ["ou", "au"]] => ["ऑ", "ॉ"], + ["N", "M"] => ["अं" ,"ं"], + "aH" => ["अः", "ः"] + +list :can_make_cluster do +consonants "k" => "क", + ["q", ["k"]] => ["\u{0958}", "क#{$nukta}"], + "kh" => "ख", + ["Kh", ["kh"]] => ["\u{0959}", "ख#{$nukta}"], + "g" => "ग", + ["G"] => ["\u{095A}", "ग#{$nukta}"], + ["gh",["g"]] => "घ", + "ng" => "ङ", + + "ch" => "च", + ["Ch", ["ch"]] => "छ", + "j" => "ज", + "z" => "ज#{$nukta}", + ["jh", ["j"]] => "झ", + "nj" => "ञ", + + "T" => "ट", + ["Th", ["T"]] => "ठ", + "D" => "ड", + ["dd", ["D"]] => ["\u{095C}", "ड#{$nukta}"], + "Dh" => "ढ", + ["DH", ["Dh"]] => ["\u{095D}", "ढ#{$nukta}"], + ["NN", ["n"]] => "ण", + + "t" => "त", + ["th", ["t"]] => "थ", + "d" => "द", + ["dh", ["d"]] => "ध", + "n" => "न", + ["NNN", ["n"]] => ["ऩ", "न#{$nukta}"], + + ["p"] => "प", + ["ph",["f"]] => "फ", + [["ph"], "f"] => 
["\u{095E}", "फ#{$nukta}"], + "b" => "ब", + ["bh", ["b"]] => "भ", + "m" => "म", + + "y" => "य", + ["Y", ["y"]] => ["\u{095F}", "य#{$nukta}"], + "r" => ["र", "र#{$virama}"], + ["R", ["r"]] => ["ऱ","र#{$nukta}"], + "l" => ["ल", "ल#{$virama}"], + ["L"] => "ळ", + ["LL", ["L"]] => ["\u0934","ळ#{$nukta}"], + ["v", "w"] => "व", + "s" => "स", + ["sh", ["s"]] => "श", + ["Sh", ["sh"]] => "ष", + "h" => "ह" +end + +ignore_duplicates true +can_make_cluster.each do |c1| + can_make_cluster.each do |c2| + if c1.match_type == 1 + consonants ["#{c1.pattern}#{c2.pattern}"] => ["#{c1.value1}#{$virama}#{c2.value1}"] + else + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}#{c2.value1}"] + end + end +end + + +# conjuncts for more than 2 consonants +consonants "kkN" => "क्क्ण", + "kky" => "क्क्य", + "kty" => "क्त्य", + "ktr" => "क्त्र", + ["ktv", "ktw"] => "क्त्व", + "kpr" => "क्प्र", + "kthn" => "क्थ्न", + "kny" => "क्न्य", + "kShN" => "क्ष्ण", + "kShm" => "क्ष्म", + "kShy" => "क्ष्य", + ["kShv", "kShw"] => "क्ष्व", + ["ksv", "ksw"] => "क्स्व", + "gjy" => "ग्ज्य", + ["gdhv", "gdhw"] => "ग्ध्व", + "gny" => "ग्न्य", + "gbhy" => "ग्ब्य", + "gry" => "ग्र्य", + ["ghny", ["gny"]] => "घ्न्य", + "ngkt" => "ङ्क्त", + "ngkSh" => "ङ्क्ष", + "ngkhy" => "ङ्ख्य", + "nggy" => "ङ्ग्य", + "nggr" => "ङ्ग्र", + "ngghy" => "ङ्घ्य", + "ngghr" => "ङ्घ्र", + "chchy" => "च्च्य", + "chChr" => "च्छ्र", + ["chChv", "chChw"] => "च्छ्व", + "jjny" => "ज्ज्ञ", + "jjy" => "ज्ज्य", + ["jjv", "jjw"] => "ज्ज्व", + "jnyy" => "ज्ञ्य", + "nyjny" => "ञ्ज्ञ", + "nyshr" => "ञ्श्र", + ["nyshv", "nyshw"] => "ञ्श्व", + "NDy" => "ण्ड्य", + "NDr" => "ण्ड्र", + "tkr" => "त्क्र", + "tkSh" => "त्क्ष", + "tty" => "त्त्य", + "ttr" => "त्त्र", + ["ttv", "ttw"] => "त्त्व", + "tny" => "त्न्य", + "tpr" => "त्प्र", + "tmy" => "त्म्य", + "try" => "त्र्य", + ["tvy", "twy"] => "त्व्य", + "tst" => "त्स्त", + "tsth" => "त्स्थ", + "tsn" => "त्स्न", + "tsm" => "त्स्म", + "tsy" => "त्स्य", + "tsr" => "त्स्र", + ["tsv", "tsw"] 
=> "त्स्व", + "dgr" => "द्ग्र", + "ddy" => "द्द्य", + "ddr" => "द्द्र", + ["ddv", "ddw"] => "द्द्व", + "ddhy" => "द्ध्य", + "ddhr" => "द्ध्र", + ["ddhv", "ddhw"] => "द्ध्व", + "dbr" => "द्ब्र", + "dbh" => "द्ब्ह", + "dbhy" => "द्ब्य", + "dbhr" => "द्ब्र", + "dry" => "द्र्य", + ["dvy", "dwy"] => "द्व्य", + ["dvr", "dwr"] => "द्व्र", + "dhny" => "ध्न्य", + "dhry" => "ध्र्य", + "nkr" => "न्क्र", + "nkl" => "न्क्ल", + "nkSh" => "न्क्ष", + "nty" => "न्त्य", + "ntr" => "न्त्र", + ["ntv", "ntw"] => "न्त्व", + "ndy" => "न्द्य", + "ndr" => "न्द्र", + ["ndv", "ndw"] => "न्द्व", + "ndhy" => "न्ध्य", + "ndhr" => "न्ध्र", + "nny" => "न्न्य", + ["nnv", "nnw"] => "न्न्व", + "npr" => "न्प्र", + "nbr" => "न्ब्र", + "nbhr" => "न्भ्र", + ["nvy", "nwy"] => "न्व्य", + "nsth" => "न्स्थ", + "nsm" => "न्स्म", + ["nsv", "nsw"] => "न्स्व", + "pty" => "प्त्य", + ["ptv", "ptw"] => "प्त्व", + "psy" => "प्स्य", + ["psv", "psw"] => "प्स्व", + "bdhv" => "ब्ध्व", + "mby" => "म्ब्य", + "mpr" => "म्प्र", + "rkSh" => "र्क्ष", + "rgy" => "र्ग्य", + "rgr" => "र्ग्र", + "rghy" => "र्घ्य", + "rngg" => "र्ङ्ग", + "rchCh" => "र्च्छ", + "rchy" => "र्च्य", + "rjny" => "र्ज्ञ", + ["rjv", "rjw"] => "र्ज्व", + "rNy" => "र्ण्य", + "rtm" => "र्त्म", + "rty" => "र्त्य", + "rtr" => "र्त्र", + ["rtv", "rtw"] => "र्त्व", + "rts" => "र्त्स", + "rtt" => "र्त्त", + "rthy" => "र्थ्य", + "rddh" => "र्द्ध", + "rdy" => "र्द्य", + "rdr" => "र्द्र", + ["rdv", "rdw"] => "र्द्व", + "rdhn" => "र्ध्न", + "rdhm" => "र्ध्म", + "rdhy" => "र्ध्य", + "rdhr" => "र्ध्र", + ["rdhv", "rdhw"] => "र्ध्व", + "rny" => "र्न्य", + "rbr" => "र्ब्र", + "rbhy" => "र्भ्य", + "rbhr" => "र्भ्र", + "rmy" => "र्म्य", + ["rvy", "rwy"] => "र्व्य", + ["rShv", "rShw"] => "र्ष्व", + "rShT" => "र्ष्ट", + "rShN" => "र्ष्ण", + "rShy" => "र्ष्य", + "rhy" => "र्ह्य", + "rhr" => "र्ह्र", + "lky" => "ल्क्य", + ["shchy", "shcy"] => "श्च्य", + "shry" => "श्र्य", + ["shvy", "shwy"] => "श्व्य", + "Shkr" => "ष्क्र", + "ShTy" => "ष्ट्य", + "ShTr" => "ष्ट्र", + ["ShTv", 
"ShTw"] => "ष्ट्व", + "ShThy" => "ष्ठ्य", + "Shthr" => "ष्थ्र", + "Shthy" => "ष्थ्य", + "ShNy" => "ष्ण्य", + "Shpr" => "ष्प्र", + "sty" => "स्त्य", + "str" => "स्त्र", + ["stv", "stw"] => "स्त्व", + "sthy" => "स्थ्य", + "smy" => "स्म्य", + "sny" => "स्न्य", + ["ssv", "ssw"] => "स्स्व" + +generate_cv + +consonants(combine get_consonants, ["*a"] => ["*1"]) + + +symbols ["om"] => "ॐ" + +symbols({:priority => :low}, [["aan", "aam"]] => $an) + +numbers "0" => "०", + "1" => "१", + "2" => "२", + "3" => "३", + "4" => "४", + "5" => "५", + "6" => "६", + "7" => "७", + "8" => "८", + "9" => "९" diff --git a/govarnam/schemes/schemes/ne/ne.scheme b/govarnam/schemes/schemes/ne/ne.scheme new file mode 100644 index 0000000..3520cdd --- /dev/null +++ b/govarnam/schemes/schemes/ne/ne.scheme @@ -0,0 +1,202 @@ +# encoding: utf-8 + +## +# Copyright (C) Prateek Kumar Baheti +# +# This is part of libvarnam. See LICENSE.txt for the license +## + +language_code "ne" +identifier "ne" +display_name "Nepali" +author "Rohit Bansal" + +ignore_duplicates false + +$virama = "\u{094D}" +$am = "\u{0902}" +$an = "\u{0901}" +$nukta = "\u{093C}" + +virama "~" => "\u{094D}" + +vowels "a" => "अ", + ["aa", "A", ["a"]] => ["आ", "ा"], + ["AA", ["aa"]] => ["आऽऽ", "ाऽऽ"], + ["ae", ["a"]] => ["ऍ", "\u{0945}"], + "i" => ["इ", "ि"], + ["ee", "I", "ii", ["i"]] => ["ई", "ी"], + "u" => ["उ", "ु"], + ["uu", "oo", "U", ["u"]] => ["ऊ", "ू"], + + ["R", ["ri", "ru", "r"]] => ["ऋ", "\u{0943}"], + ["RR", ["R", "rr", "rri"]] => ["ॠ", "ॄ"], + ["LR", ["lri", "lru"]] => ["ऌ", "ॢ"], + ["LRR", ["LR", "lrri", "lrru"]] => ["ॡ", "ॣ"], + + "e" => ["ए", "े"], + "ea" => ["एऽ", "ेऽ"], + ["eA", ["ea"]] => ["एऽऽ", "ेऽऽ"], + ["ai", "ei"] => ["ऐ", "ै"], + ["ay", ["ai"]] => ["ऎ", "ॆ"], + + "o" => ["ऒ", "ॊ"], + ["O", ["o"]] => ["ओ", "ो"], + ["Oa", ["oa"]] => ["ओऽ", "ोऽ"], + ["ou", "au", "ow"] => ["औ", "ौ"], + ["aw", ["au"]] => ["ऑ", "ॉ"], + ["N", "M",] => ["अं" ,"ं"], + ["H"] => ["अः", "ः"] + +list :can_make_cluster do +consonants "k" => 
"क", + [["k"], "q"] => ["\u{0958}", "क#{$nukta}"], + "kh" => "ख", + ["KH", ["kh"]] => ["\u{0959}", "ख#{$nukta}"], + "g" => "ग", + ["G", ["g"]] => ["\u{095A}", "ग#{$nukta}"], + ["gh", ["g"]] => "घ", + "ng" => "ङ", + + "ch" => "च", + ["CH", ["ch"]] => "छ", + "j" => "ज", + ["z", ["j"]] => ["\u{095B}", "ज#{$nukta}"], + ["jh", ["j"]] => "झ", + "nj" => "ञ", + + "T" => "ट", + ["Th", ["T"]] => "ठ", + "D" => "ड", + ["DD", ["D"]] => ["\u{095C}", "ड#{$nukta}"], + ["Dh", ["D"]] => "ढ", + ["DH", ["Dh"]] => ["\u{095D}", "ढ#{$nukta}"], + "NN" => "ण", + + "t" => "त", + ["th", ["t"]] => "थ", + "d" => "द", + ["dh", ["d"]] => "ध", + "n" => "न", + "NNN" => ["ऩ", "न#{$nukta}"], + + "p" => "प", + ["ph", ["f"]] => "फ", + ["f", ["ph"]] => ["\u{095E}", "फ#{$nukta}"], + "b" => "ब", + "bh" => "भ", + + "y" => "य", + ["Y", ["y"]] => ["\u{095F}", "य#{$nukta}"], + "r" => ["र", "र#{$virama}"], + "RRR" => ["ऱ", "र#{$nukta}"], + "l" => ["ल", "ल#{$virama}"], + ["L", ["l"]] => "ळ", + ["LL", ["L"]] => ["ऴ", "ळ#{$nukta}"], + ["v", "w"] => "व", + "s" => "स", + ["sh", ["s"]] => "श", + ["Sh", ["sh"]] => "ष", + "h" => "ह" +end + +# conjuncts for more than 2 consonants +consonants "kShN" => "क्ष्ण", #tikShN + "kShm" => "क्ष्म", #sukShm + "kShy" => "क्ष्य", + "kShv" => "क्ष्व", + "tkr" => "त्क्र", + "tsn" => "त्स्न", + "tpr" => "त्प्र", + "try" => "त्र्य", + "tsy" => "त्स्य", + "nkt" => "न्क्त", + "nkr" => "न्क्र", + "nkhy" => "न्ख्य", #sankhya + "ngy" => "न्ग्य", #sangya + "ngr" => "न्ग्र", #sangrah + "ngl" => "न्ग्ल", #ungli + "nghr" => "न्घ्र", #ghunghru + "njr" => "न्ज्र", #khanjri + "njhl" => "न्ज्ल", #jhunjhalahat + "ntr" => "न्त्र", + "nDl" => "न्ड्ल", + "nphl" => "न्फ्ल", + "nShy" => "न्ष्य", + "nsk" => "न्स्क", + "nsth" => "न्स्थ", + "nsp" => "न्स्प", + "nsm" => "न्स्म", + "nty" => "न्त्य", + "ndy" => "न्द्य", + "ndhy" => "न्ध्य", + "ndr" => "न्द्र", + "ndhr" => "न्ध्र", + "ndv" => "न्द्व", + "mpr" => "म्प्र", + "mbhr" => "म्भ्र", + "rkhy" => "र्ख्य", + "rghy" => "र्घ्य", + "rjy" => "र्ज्य", + "rty" 
=> "र्त्य", + "rtm" => "र्त्म", + "rdr" => "र्द्र", #Ardra + "rdhv" => "र्ध्व", #oordhva + "rshv" => "र्स्व", #pArshva + "lky" => "ल्क्य", + "str" => "स्त्र", + "sthy" => "स्थ्य", + "spr" => "स्प्र", + "smr" => "स्म्र", + "Shtr" => "ष्त्र", + "Shthy" => "ष्थ्य" + +ignore_duplicates true +can_make_cluster.each do |c1| + can_make_cluster.each do |c2| + if c1.match_type == 1 + consonants ["#{c1.pattern}#{c2.pattern}"] => ["#{c1.value1}#{$virama}#{c2.value1}"] + else + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}#{c2.value1}"] + end + end +end + +# Half forms for म + +consonants(combine can_make_cluster, ["m*"] => ["म#{$virama}*1"]) + +generate_cv + +consonants(combine get_consonants, ["*a"] => ["*1"]) + +ignore_duplicates true + +# Need to replace this when we have a way to pass value2 and value3 in combine +consonants [["m"]] => "म" + +get_vowels.each do |vowel| + if vowel.match_type == 1 + consonant_vowel_combinations ["m#{vowel.pattern}"] => ["म#{vowel.value2}"] + else + consonant_vowel_combinations [["m#{vowel.pattern}"]] => ["म#{vowel.value2}"] + end +end + +symbols [["n", "m"]] => $an, + ["om"] => "ॐ", + ["?"] => "ॽ", + ["|"] => "।" + +symbols({:priority => :low}, ["aan", "aam"] => $an) + +numbers "0" => "०", + "1" => "१", + "2" => "२", + "3" => "३", + "4" => "४", + "5" => "५", + "6" => "६", + "7" => "७", + "8" => "८", + "9" => "९" diff --git a/govarnam/schemes/schemes/or/or.scheme b/govarnam/schemes/schemes/or/or.scheme new file mode 100644 index 0000000..d06a578 --- /dev/null +++ b/govarnam/schemes/schemes/or/or.scheme @@ -0,0 +1,191 @@ +# encoding: utf-8 + +## +# Copyright (C) Nilakanta Mallick +# +# This is part of libvarnam. 
See LICENSE.txt for the license +## + +language_code "or" +identifier "or" +display_name "Odia" +author "Nilakanta Mallick" + +ignore_duplicates false + +$virama = "\u{0B4D}" +$nukta = "\u{0B3C}" + +$an = "\u{0B01}" +$am = "\u{0B02}" + +virama "~" => "\u{0B4D}" +visarga "H" => "\u{0B03}" +$visarga = "\u{0B03}" + +vowels "a" => "ଅ", + ["aa", "A", ["a"]] => ["ଆ", "ା"], + ["AA", ["aa"]] => ["ଆଽଽ", "ାଽଽ"], + "i" => ["ଇ", "ି"], + ["ee", "I", "ii", ["i"]] => ["ଈ", "ୀ"], + "u" => ["ଉ", "ୁ"], + ["uu", "oo", "U", ["u"]] => ["ଊ", "ୂ"], + + ["Ru", "ru"] => ["ଋ","ୃ"], + ["RRu", ["Rru"]] => ["ୠ", "ୄ"], + ["L", ["li", "lu"]] => ["ଌ","\u{0B62}"], + ["LL", ["LLi", "LLu", "Lli", "Llu"]] => ["ୡ", "\u{0B63}"], + + "e" => ["ଏ", "େ"], + "ea" => ["ଏଽ", "େଽ"], + ["ai", "ei"] => ["ଐ", "\u{0B48}"], + ["o", ["O"]] => ["ଓ", "\u{0B4B}"], + "oa" => ["ଓଽ", "\u{0B4B}ଽ"], + ["ou", "au", "ow"] => ["ଔ", "\u{0B4C}"], + + ["AN", "AM"] => ["ଅଂ ", $am], + ["aH"] => ["ଅଃ ", $visarga] + +list :can_make_cluster do +consonants "k" => "କ", + ["kh", ["k", "Kh"]] => "ଖ", + "g" => "ଗ", + ["gh", ["g"]] => "ଘ", + "ng" => "ଙ", + + "ch" => "ଚ", + ["Ch", ["ch"]] => "ଛ", + "j" => "ଜ", + ["jh", ["j"]] => "ଝ", + "ny" => "ଞ", + + "T" => "ଟ", + ["Th", ["T"]] => "ଠ", + "D" => "ଡ", + ["Dd", ["dd", "D"]] => ["\u{0B5C}", "ଡ#{$nukta}"], + "Dh" => "ଢ", + ["DH", ["Dh", "D"]] => ["\u{0B5D}", "ଢ#{$nukta}"], + "N" => "ଣ", + + "t" => "ତ", + ["th", ["t"]] => "ଥ", + "d" => "ଦ", + ["dh", ["d"]] => "ଧ", + "n" => "ନ", + + "p" => "ପ", + ["f", ["ph"]] => "ଫ", + ["ph", ["f"]] => "ਫ਼", + ["b", ["v"]] => "ବ", + ["bh", ["b"]] => "ଭ", + "m" => "ମ", + + ["y", ["j"]] => "ଯ", + "Y" => "ୟ", + "r" => ["ର", "ର#{$virama}"], + "l" => ["ଲ", "ଲ#{$virama}"], + ["ll", ["l"]] => "ଳ", + ["v", ["b"]] => "ଵ", + "w" => "ୱ", # it is o + v + "s" => "ସ", + ["sh", ["Sh", "s"]] => "ଶ", + ["Sh", ["sh"]] => "ଷ", + "h" => "ହ" +end + +# consonant conjuncts of 2 or more consonants +consonants "ktr" => "କ୍ତ୍ର", + "kShsh" => "କ୍ଷ୍ଶ", + "kShN" => "କ୍ଷ୍ଣ", + "kShm" => 
"କ୍ଷ୍ମ", + "kShY" => "କ୍ଷ୍ୟ", + "ngkSh" => "ଙ୍କ୍ଷ", + "chChb" => "ଚ୍ଛ୍ବ", + "jjb" => "ଜ୍ଜ୍ବ", + "jjw" => "ଜ୍ଜ୍ୱ", + "jjv" => "ଜ୍ଜ୍ଵ", + "ttb" => "ତ୍ତ୍ବ", + "tmy" => "ତ୍ମ୍ଯ", + "tmY" => "ତ୍ମ୍ୟ", + "tsm" => "ତ୍ସ୍ମ", + "tsn" => "ତ୍ସ୍ନ", + "tsy" => "ତ୍ସ୍ଯ", + "tsY" => "ତ୍ସ୍ୟ", + "ddhw" => "ଦ୍ଧ୍ୱ", + "ntb" => "ନ୍ତ୍ବ", + "ntr" => "ନ୍ତ୍ର", + "ntrY" => "ନ୍ତ୍ର୍ୟ", + "ndb" => "ନ୍ଦ୍ବ", + "ndY" => "ନ୍ଦ୍ୟ", + "ndr" => "ନ୍ଦ୍ର", + "ndhr" => "ନ୍ଧ୍ର", + "ndw" => "ନ୍ଧ୍ୱ", + "nwtw" => "ନ୍ୱ୍ତ୍ୱ", + "nnY" => "ନ୍ନ୍ୟ", + "rghY" => "ର୍ଘ୍ୟ", + "rchCh" => "ର୍ଚ୍ଛ", + "rshsh" => "ର୍ଶ୍ଶ", + "rttY" => "ର୍ତ୍ତ୍ୟ", + "rtsY" => "ର୍ତ୍ସ୍ୟ", + "rdr" => "ର୍ଦ୍ର", + "rdDhY" => "ର୍ଦ୍ଢ୍ୟ", + "ryY" => "ର୍ଯ୍ୟ", + "rshb" => "ର୍ଶ୍ବ", + "shDb" => "ଶ୍ଡ୍ବ", + "Shkr" => "ଷ୍କ୍ର", + "Shtr" => "ଷ୍ତ୍ର", + "ShThr" => "ଷ୍ଠ୍ର", + "Shpr" => "ଷ୍ପ୍ର", + "ShTr" => "ଷ୍ଟ୍ର", + "stb" => "ସ୍ତ୍ବ", + "stY" => "ସ୍ତ୍ୟ", + "str" => "ସ୍ତ୍ର", + "sthy" => "ସ୍ଥ୍ଯ", + "sthY" => "ସ୍ଥ୍ୟ" + +ignore_duplicates true +can_make_cluster.each do |c1| + can_make_cluster.each do |c2| + if c1.match_type == 1 + consonants ["#{c1.pattern}#{c2.pattern}"] => ["#{c1.value1}#{$virama}#{c2.value1}"] + else + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}#{c2.value1}"] + end + end +end + +# Half forms for ମ +consonants(combine can_make_cluster, ["m*"] => ["ମ#{$virama}*1"]) + +generate_cv + +consonants(combine get_consonants, ["*a"] => ["*1"]) + +ignore_duplicates true + +#consonants [["m"]] => "ମ" + +# Need to replace this when we have a way to pass value2 and value3 in combine +get_vowels.each do |vowel| + if vowel.match_type == 1 + consonant_vowel_combinations ["m#{vowel.pattern}"] => ["ମ#{vowel.value2}"] + else + consonant_vowel_combinations [["m#{vowel.pattern}"]] => ["ମ#{vowel.value2}"] + end +end +symbols ["m", ["n"]] => $am, + [["n", "m"]] => $an, + ["|"] => "।" + +symbols({:priority => :low}, ["aan", "aam"] => $an) + +numbers "0" => "୦", + "1" => "୧", + "2" => "୨", + "3" => "୩", + "4" => "୪", + "5" => "୫", + "6" => "୬", + "7" 
=> "୭", + "8" => "୮", + "9" => "୯" diff --git a/govarnam/schemes/schemes/pa/pa.scheme b/govarnam/schemes/schemes/pa/pa.scheme new file mode 100644 index 0000000..b236d67 --- /dev/null +++ b/govarnam/schemes/schemes/pa/pa.scheme @@ -0,0 +1,141 @@ +# encoding: utf-8 + +## +# Copyright (C) Amanpreet Kaur +# +# This is part of libvarnam. See LICENSE.txt for the license +## + +language_code "pa" +identifier "pa" +display_name "Punjabi" +author "Amanpreet Kaur" + +ignore_duplicates false + +$virama = "\u{0A4D}" +$nukta = "\u{0A3C}" +$addak = "\u{0A71}" +$an = "\u{0A01}" +$am = "\u{0A02}" + +virama "~" => "\u{0A4D}" +visarga "H" => "\u{0A03}" + +vowels "a" => "ਅ", + ["aa", "A", ["a"]] => ["ਆ", "\u{0A3E}"], + "i" => ["ਇ", "\u{0A3F}"], + ["ee", "I", "ii", ["i"]] => ["ਈ", "\u{0A40}"], + "u" => ["ਉ", "\u{0A41}"], + ["uu", "oo", "U", ["u"]] => ["ਊ", "\u{0A42}"], + ["e", "ae"] => ["ਏ", "\u{0A47}"], + ["ai", "ei"] => ["ਐ", "\u{0A48}"], + ["o", ["O"]] => ["ਓ", "\u{0A4B}"], + ["ou", "ow"] => ["ਔ", "\u{0A4C}"], + ["an",["en"]] => "\u{0A70}" + +list :can_make_cluster do +consonants "k" => "ਕ", + ["kh", ["Kh"]] => "ਖ", + [["kh"], "KH"] => ["ਖ਼", "ਖ#{$nukta}"], + "g" => "ਗ", + ["G"] => ["ਗ਼", "ਗ#{$nukta}"], + ["gh", ["Gh"]] => "ਘ", + "ng" => "ਙ", + "ch" => "ਚ", + ["CH", ["ch"]] => "ਛ", + "j" => "ਜ", + "z" => ["ਜ਼", "ਜ#{$nukta}"], + "jh" => "ਝ", + "ny" => "ਞ", + ["T", ["t"]] => "ਟ", + "Th" => "ਠ", + ["D", ["d"]] => "ਡ", + "Dh" => ["ਢ"], + ["N",["n"]] => "ਣ", + + "t" => "ਤ", + ["th", ["dh"]] => "ਥ", + "d" => "ਦ", + "dh" => "ਧ", + [["n"]] => "ਨ", + "p" => "ਪ", + "ph" => "ਫ", + "f" => ["ਫ਼", "ਫ#{$nukta}"], + "b" => "ਬ", + "bh" => "ਭ", + [["m"]] => "ਮ", + "y" => "ਯ", + "ya" => "ਆ", + "r" => ["ਰ", "ਰ#{$virama}"], + "l" => ["ਲ", "ਲ#{$virama}"], + "ll" => "ਲ਼", + ["v", "w"] => "ਵ", + ["sh", ["Sh", "SH"]] => "ਸ਼", + ["s"] => "ਸ", + ["h"] => "ਹ", + ["R","RR", ["ri", "ru"]] => "ੜ" +end + + +ignore_duplicates true +can_make_cluster.each do |c1| + can_make_cluster.each do |c2| + if c1.match_type 
== 1 + consonants ["#{c1.pattern}#{c2.pattern}"] => ["#{c1.value1}#{$virama}#{c2.value1}"] + else + consonants [["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}#{c2.value1}"] + end + + if c1.value1 == c2.value1 + consonants [["#{c1.pattern}#{c2.pattern}", "#{c1.pattern}"]] => ["#{$addak}#{c1.value1}"] + end + end +end + +# Half forms for ਮ +consonants(combine can_make_cluster, ["m*"] => ["ਮ#{$virama}*1"]) +consonants(combine can_make_cluster, ["n*"] => ["ਨ#{$virama}*1"]) + +generate_cv + +consonants(combine get_consonants, ["*a"] => ["*1"]) + +# Need to replace this when we have a way to pass value2 and value3 in combine +get_vowels.each do |vowel| + if vowel.match_type == 1 + consonant_vowel_combinations ["m#{vowel.pattern}"] => ["ਮ#{vowel.value2}"] + else + consonant_vowel_combinations [["m#{vowel.pattern}"]] => ["ਮ#{vowel.value2}"] + end +end + +tag "chill" do + consonants "n" => ["\u{0A70}"], + "au" => ["\u{0A4C}"] + +end + +symbols ["m", ["n"]] => $am, + ["n", ["m"]] => $an, + ["|"] => "।", + "om" => "ੴ", + "#" => $addak, + "." => $nukta, + "M" => ["\u{0A70}"] + +symbols({:priority => :low}, ["aan", "aam"] => $an, "om" => "ੴ") + +ignore_duplicates false + + +numbers "0" => "੦", + "1" => "੧", + "2" => "੨", + "3" => "੩", + "4" => "੪", + "5" => "੫", + "6" => "੬", + "7" => "੭", + "8" => "੮", + "9" => "੯" diff --git a/govarnam/schemes/schemes/sa/sa.scheme b/govarnam/schemes/schemes/sa/sa.scheme new file mode 100644 index 0000000..672517f --- /dev/null +++ b/govarnam/schemes/schemes/sa/sa.scheme @@ -0,0 +1,312 @@ +# encoding: utf-8 + +## +# Copyright (C) Rohit Bansal +# +# This is part of libvarnam. 
See LICENSE.txt for the license +## + +language_code "sa" +identifier "sa" +display_name "Sanskrit" +author "Rohit Bansal" + +stable true + +# +# since Sanskrit uses Devnagri script, this file should be very similar to Hindi file +# Sanskrit has no concept of "nukta", "anuswar" (except trailing m sound) or chandrabindu +# + +ignore_duplicates false + +$virama = "\u{094D}" +$an = "\u{0901}" +$am = "\u{0902}" + +virama "~" => "\u{094D}" + +vowels "a" => "अ", + ["aa", "A", ["a"]] => ["आ", "ा"], + ["AA", ["aa"]] => ["आऽऽ", "ाऽऽ"], + "i" => ["इ", "ि"], + ["ee", "I", "ii", ["i"]] => ["ई", "ी"], + "u" => ["उ", "ु"], + ["uu", "oo", "U", ["u"]] => ["ऊ", "ू"], + ["R", ["Ri"]] => ["ऋ", "\u{0943}"], + ["RR", ["R"]] => ["ॠ", "\u{0944}"], + ["Lr", ["l"]] => ["ऌ", "\u{0962}"], + ["LLr", ["Lr"]] => ["ॡ", "\u{0963}"], + "e" => ["ए", "े"], + "ea" => ["एऽ", "ेऽ"], + ["eA", ["ea"]] => ["एऽऽ", "ेऽऽ"], + ["ai", "ei"] => ["ऐ", "ै"], + "o" => ["ओ", "ो"], + "oa" => ["ओऽ", "ोऽ"], + ["ou", "au", "ow"] => ["औ", "ौ"], + ["M"] => ["अं" ,"ं"], + ["H"] => ["अः", "ः"] + +list :can_make_cluster do +consonants "k" => "क", + ["kh", ["k"]] => "ख", + "g" => "ग", + ["gh", ["g"]] => "घ", + "NG" => "ङ", + + ["ch", ["c"]] => "च", + ["Ch", ["ch"]] => "छ", + "j" => "ज", + ["jh", ["J", "j"]] => "झ", + "NY" => "ञ", + + "T" => "ट", + ["Th", ["T"]] => "ठ", + "D" => "ड", + ["Dh", ["D"]] => ["ढ"], + "N" => "ण", + + "t" => "त", + ["th", ["t"]] => "थ", + "d" => "द", + ["dh", ["d"]] => "ध", + "n" => "न", + + "p" => "प", + ["f", ["ph"]] => "फ", + "b" => "ब", + ["bh", ["b"]] => "भ", + "m" => "म", + + "y" => "य", +# "r" => ["र", "र#{$virama}"], +# "l" => ["ल", "ल#{$virama}"], + "r" => "र", + "l" => "ल", + ["v", "w"] => "व", + "s" => "स", + ["sh", ["s"]] => "श", + ["Sh", ["sh"]] => "ष", + "h" => "ह" +end + +ignore_duplicates true +can_make_cluster.each do |c1| + can_make_cluster.each do |c2| + if c1.match_type == 1 + consonants ["#{c1.pattern}#{c2.pattern}"] => ["#{c1.value1}#{$virama}#{c2.value1}"] + else + consonants 
[["#{c1.pattern}#{c2.pattern}"]] => ["#{c1.value1}#{$virama}#{c2.value1}"] + end + end +end + +# following are the consonants clusters, having more than 2 consonants +# few sequences are commented as the software can't handle such a long sequence +consonants "kkN" => "क्क्ण", + "kky" => "क्क्य", + "kty" => "क्त्य", + "ktr" => "क्त्र", + ["ktv", "ktw"] => "क्त्व", + "kpr" => "क्प्र", + "kthn" => "क्थ्न", + "kny" => "क्न्य", + "kShN" => "क्ष्ण", + "kShm" => "क्ष्म", +# "kShmy" => "क्ष्म्य", + "kShy" => "क्ष्य", + ["kShv", "kShw"] => "क्ष्व", + ["ksv", "ksw"] => "क्स्व", + "gjy" => "ग्ज्य", + ["gdhv", "gdhw"] => "ग्ध्व", + "gny" => "ग्न्य", + "gbhy" => "ग्ब्य", + "gry" => "ग्र्य", + ["ghny", ["gny"]] => "घ्न्य", + ["NGkt", "NGqt"] => "ङ्क्त", + ["NGkSh", "NGqSh"] => "ङ्क्ष", +# ["NGkShv", "NGkShw", ["NGqShv", "NGqShw"]] => "ङ्क्ष्व", + ["NGkhy", ["NGky"]] => "ङ्ख्य", + "NGgy" => "ङ्ग्य", + "NGgr" => "ङ्ग्र", + ["NGghy", ["NGgy"]] => "ङ्घ्य", + ["NGghr", ["NGgr"]] => "ङ्घ्र", + ["chchy", "ccy"] => "च्च्य", + ["chChr", "cChr"] => "च्छ्र", + ["chChv", "chChw", ["cChv", "cChw"]] => "च्छ्व", + "jjNY" => "ज्ज्ञ", + "jjy" => "ज्ज्य", + ["jjv", "jjw"] => "ज्ज्व", + "jNYy" => "ज्ञ्य", + "NYjNY" => "ञ्ज्ञ", + "NYshr" => "ञ्श्र", + ["NYshv", "NYshw"] => "ञ्श्व", + "NDy" => "ण्ड्य", + "NDr" => "ण्ड्र", + "tkr" => "त्क्र", + "tkSh" => "त्क्ष", + "tty" => "त्त्य", + "ttr" => "त्त्र", + ["ttv", "ttw"] => "त्त्व", + "tny" => "त्न्य", + "tpr" => "त्प्र", + "tmy" => "त्म्य", + "try" => "त्र्य", + ["tvy", "twy"] => "त्व्य", + "tst" => "त्स्त", +# "tstr" => "त्स्त्र", + "tsth" => "त्स्थ", + "tsn" => "त्स्न", + "tsm" => "त्स्म", +# "tsny" => "त्स्न्य", + "tsy" => "त्स्य", + "tsr" => "त्स्र", + ["tsv", "tsw"] => "त्स्व", + "dgr" => "द्ग्र", + "ddy" => "द्द्य", + "ddr" => "द्द्र", + ["ddv", "ddw"] => "द्द्व", + "ddhy" => "द्ध्य", + "ddhr" => "द्ध्र", + ["ddhv", "ddhw"] => "द्ध्व", + "dbr" => "द्ब्र", + "dbh" => "द्ब्ह", + "dbhy" => "द्ब्य", + "dbhr" => "द्ब्र", + "dry" => "द्र्य", + ["dvy", 
"dwy"] => "द्व्य", + ["dvr", "dwr"] => "द्व्र", + "dhny" => "ध्न्य", + "dhry" => "ध्र्य", + "nkr" => "न्क्र", + "nkl" => "न्क्ल", + "nkSh" => "न्क्ष", + "nty" => "न्त्य", + "ntr" => "न्त्र", +# "ntry" => "न्त्र्य", + ["ntv", "ntw"] => "न्त्व", +# "ntsy" => "न्त्स्य", + "ndy" => "न्द्य", + "ndr" => "न्द्र", + ["ndv", "ndw"] => "न्द्व", + "ndhy" => "न्ध्य", + "ndhr" => "न्ध्र", + "nny" => "न्न्य", + ["nnv", "nnw"] => "न्न्व", + "npr" => "न्प्र", + "nbr" => "न्ब्र", + "nbhr" => "न्भ्र", + ["nvy", "nwy"] => "न्व्य", + "nsth" => "न्स्थ", + "nsm" => "न्स्म", + ["nsv", "nsw"] => "न्स्व", + "pty" => "प्त्य", + ["ptv", "ptw"] => "प्त्व", + "psy" => "प्स्य", + ["psv", "psw"] => "प्स्व", + "bdhv" => "ब्ध्व", + "mby" => "म्ब्य", + "mpr" => "म्प्र", + "rkSh" => "र्क्ष", +# "rkShy" => "र्क्ष्य", + "rgy" => "र्ग्य", + "rgr" => "र्ग्र", + "rghy" => "र्घ्य", + "rNGg" => "र्ङ्ग", + ["rchCh", "rcCh"] => "र्च्छ", + ["rchy", "rcy"] => "र्च्य", + "rjNY" => "र्ज्ञ", + ["rjv", "rjw"] => "र्ज्व", + "rNy" => "र्ण्य", + "rtm" => "र्त्म", + "rty" => "र्त्य", + "rtr" => "र्त्र", + ["rtv", "rtw"] => "र्त्व", + "rts" => "र्त्स", +# "rtsny" => "र्त्स्न्य", + "rtt" => "र्त्त", +# "rtty" => "र्त्त्य", +# "rtsy" => "र्त्स्य", + "rthy" => "र्थ्य", + "rddh" => "र्द्ध", +# "rddhr" => "र्द्ध्र", + "rdy" => "र्द्य", + "rdr" => "र्द्र", + ["rdv", "rdw"] => "र्द्व", + "rdhn" => "र्ध्न", + "rdhm" => "र्ध्म", + "rdhy" => "र्ध्य", + "rdhr" => "र्ध्र", + ["rdhv", "rdhw"] => "र्ध्व", + "rny" => "र्न्य", + "rbr" => "र्ब्र", + "rbhy" => "र्भ्य", + "rbhr" => "र्भ्र", + "rmy" => "र्म्य", + ["rvy", "rwy"] => "र्व्य", + ["rShv", "rShw"] => "र्ष्व", + "rShT" => "र्ष्ट", + "rShN" => "र्ष्ण", + "rShy" => "र्ष्य", + "rhy" => "र्ह्य", + "rhr" => "र्ह्र", + "lky" => "ल्क्य", + ["shchy", "shcy"] => "श्च्य", + "shry" => "श्र्य", + ["shvy", "shwy"] => "श्व्य", + "Shkr" => "ष्क्र", + "ShTy" => "ष्ट्य", + "ShTr" => "ष्ट्र", +# "ShTry" => "ष्ट्र्य", + ["ShTv", "ShTw"] => "ष्ट्व", + "ShThy" => "ष्ठ्य", + "Shthr" => "ष्थ्र", + 
"Shthy" => "ष्थ्य", + "ShNy" => "ष्ण्य", + "Shpr" => "ष्प्र", + "sty" => "स्त्य", + "str" => "स्त्र", +# "stry" => "स्त्र्य", + ["stv", "stw"] => "स्त्व", + "sthy" => "स्थ्य", + "smy" => "स्म्य", + "sny" => "स्न्य", + ["ssv", "ssw"] => "स्स्व" + +# Half forms for म +consonants(combine can_make_cluster, ["m*"] => ["म#{$virama}*1"]) + +generate_cv + +consonants(combine get_consonants, ["*a"] => ["*1"]) + +ignore_duplicates false + +#consonants ["m"] => "म" + +# Need to replace this when we have a way to pass value2 and value3 in combine +get_vowels.each do |vowel| + if vowel.match_type == 1 +# consonant_vowel_combinations ["m#{vowel.pattern}"] => ["म#{vowel.value2}"] + else +# consonant_vowel_combinations [["m#{vowel.pattern}"]] => ["म#{vowel.value2}"] + end +end + +#symbols ["m", ["n"]] => $am, +# [["n", "m"]] => $an, +symbols [".","|"] => "।", + ["om"] => "\u{0950}" + +symbols({:priority => :low}, ["aan", "aam"] => $an) + +numbers "0" => "०", + "1" => "१", + "2" => "२", + "3" => "३", + "4" => "४", + "5" => "५", + "6" => "६", + "7" => "७", + "8" => "८", + "9" => "९" diff --git a/govarnam/schemes/schemes/ta/ta.scheme b/govarnam/schemes/schemes/ta/ta.scheme new file mode 100644 index 0000000..1a58433 --- /dev/null +++ b/govarnam/schemes/schemes/ta/ta.scheme @@ -0,0 +1,93 @@ +# encoding: utf-8 + +## +# Copyright (C) Navaneeth.K.N +# +# This is part of libvarnam. 
See LICENSE.txt for the license +## + +language_code "ta" +identifier "ta" +display_name "Tamil" +author "Navaneeth KN/Kumaran Venkataraman/Kishore96in" + +$virama = "்" +virama "~" => "்" + +infer_dead_consonants true + +vowels "a" => "அ", + ["aa", "A", ["a"]] => ["ஆ", "ா"], + ["i", ["y", "e"]] => ["இ", "ி"], + ["ii", "I", "ee", ["i"]] => ["ஈ", "ீ"], + "u" => ["உ", "ு"], + ["uu", "oo", "U", ["u"]] => ["ஊ","ூ"], + ["e",["ye"]] => ["எ", "ெ"], + ["E", ["e"]] => ["ஏ", "ே"], + ["ai", ["ei"]] => ["ஐ", "ை"], + "o" => ["ஒ", "ொ"], + ["O", ["o"]] => ["ஓ", "ோ"], + ["au", "ow", "ou"] => ["ஔ", "ௌ"], + ["q"] => "ஃ" + +consonants ["ka", ["qa"], "ga", ["ha"]] => "க", + ["nga"] => "ங", + ["sa", "cha"] => "ச", + ["ja", "za"] => "ஜ", + ["NJa", ["nja","nya","nga"]] => "ஞ", + ["ta","da", ["tha"]] => "ட", + ["Na",["na"]] => "ண", + ["tha", "dha", ["ta","da"]] => "த", + ["nna",["Na","na"]] => "ன", + ["na"] => "ந", + ["pa", "ba", ["pha", "bha"]] => "ப", + ["ma"] => "ம", + ["ya"] => "ய", + ["ra"] => "ர", + ["va", "wa"] => "வ", + ["Ra", "RRa", ["ra", "tra", "dra"]] => "ற", + "la" => "ல", + ["La", ["lla", "la"]] => "ள", + ["zha",["la"]] => "ழ", + ["Sa"] => "ஶ", + ["sha"] => "ஷ", + ["ssa",["sa"]] => "ஸ", + ["ha"] => "ஹ", + ["fa"] => "ஃப" + +consonants ["kk*"] => ["க#{$virama}க#{$virama}*1"], + [["nga", "nka"]] => ["ங#{$virama}க"], + [["tra"]] => ["ற#{$virama}ற"], + ["ncha", "nja"] => ["ஞ#{$virama}ச"] + +infer_dead_consonants false + +generate_cv + +# generating combination for ei +# sei = செய்(sey) +get_dead_consonants.each do |dead_c| + if dead_c.match_type == 1 + consonant_vowel_combinations ["#{dead_c.pattern}ei"] => ["#{dead_c.value1.chomp($virama)}ெய#{$virama}"] + else + consonant_vowel_combinations [["#{dead_c.pattern}ei"]] => ["#{dead_c.value1.chomp($virama)}ெய#{$virama}"] + end +end + +infer_dead_consonants true +ignore_duplicates true + +tag "chill" do + consonants "n" => ["ன்"] +end + +numbers "0" => "௦", + "1" => "௧", + "2" => "௨", + "3" => "௩", + "4" => "௪", + "5" => "௫", + "6" => 
"௬", + "7" => "௭", + "8" => "௮", + "9" => "௯" diff --git a/govarnam/schemes/schemes/te/te.scheme b/govarnam/schemes/schemes/te/te.scheme new file mode 100644 index 0000000..747cc80 --- /dev/null +++ b/govarnam/schemes/schemes/te/te.scheme @@ -0,0 +1,115 @@ +# encoding: utf-8 + +## +# Copyright (C) Raja Reddy Karri +# +# This is part of libvarnam. See LICENSE.txt for the license +## + +language_code "te" +identifier "te" +display_name "Telugu" +author "Raja Reddy Karri, Varun Reddy" +stable true + +ignore_duplicates false + +$virama = "\u{0C4D}" +$visarga = "\u{0C03}" +$avagraha = "\u{0C3D}" + +$an = "\u{0C01}" +$am = "\u{0C02}" + +virama "~" => $virama +visarga "H" => "\u{0C03}" + +vowels "a" => "అ", + ["aa", "A", ["a"]] => ["ఆ", "\u{0C3E}"], + "i" => ["ఇ", "\u{0C3F}"], + ["ee", "I", "ii", ["i"]] => ["ఈ", "\u{0C40}"], + "u" => ["ఉ", "\u{0C41}"], + ["uu", "oo", "U", ["u"]] => ["ఊ", "\u{0C42}"], + "Ru" => ["ఋ", "\u{0C43}"], + ["RRu", ["Ru"]] => ["ౠ", "\u{0C44}"], + "lLu" => ["ఌ", "\u{0C62}"], + ["lLLu", ["Lu"]] => ["ౡ", "\u{0C63}"], + "e" => ["ఎ", "\u{0C46}"], + ["E", ["ee", "ae", "ye"]] => ["ఏ", "\u{0C47}"], + ["ai", "ei"] => ["ఐ", "\u{0C48}"], + ["o", ["O"]] => ["ఒ", "\u{0C4A}"], + ["O", ["oo"]] => ["ఓ", "\u{0C4B}"], + ["au", "ou", "ow"] => ["ఔ", "\u{0C4C}"] + +infer_dead_consonants true + +consonants ["ka"] => "క", + ["kha", ["ka"]] => "ఖ", + "ga" => "గ", + ["gha", ["ga"]] => "ఘ", + ["NGa",["nga"]] => "ఙ", + + "cha" => "చ", + ["Cha", ["cha"]] => "ఛ", + "ja" => "జ", + ["jha", ["ja"]] => "ఝ", + "Nnya" => "ఞ", + + "ta" => "ట", + ["Ta"] => "ఠ", + ["da"] => "డ", + ["Da", ["da"]] => "ఢ", + + ["tha", ["ta"]] => "త", + ["Tha" , ["dha"]]=> "థ", + ["dha"] => "ద", + ["Dha"] => "ధ", + + "pa" => "ప", + ["pha", ["pa"]] => "ఫ", + "ba" => "బ", + ["bha", ["ba"]] => "భ", + + "ya" => "య", + "ra" => "ర", + ["RRA", ["rra"]] => "ఱ", + "la" => "ల", + ["lla",["La"]] => "ళ", + ["llla", ["LLLa"]] => "\u{0C34}", + ["va", ["wa"]] => "వ", + "sa" => "స", + [["Sa"],"sha"] => "శ", + ["Sha", 
["sa"]] => "ష", + "ha" => "హ" + +consonants({:accept_if => :starts_with}, ["ca"] => "క") + +infer_dead_consonants false + +consonants "na" => "న", + [["na"], "Na"] => "ణ", + "ma" => "మ" + +infer_dead_consonants true + +generate_cv + +symbols({:priority => :low}, ["aan", "aam"] => $an) + +tag "chill" do + consonants ["m", ["n"]] => ["ం"] + consonants ["M"] => ["మ\u{0C4D}"] + consonants "N" => ["ణ\u{0C4D}"] + consonants "n" => ["న\u{0C4D}"] +end + +numbers "0" => "౦", + "1" => "౧", + "2" => "౨", + "3" => "౩", + "4" => "౪", + "5" => "౫", + "6" => "౬", + "7" => "౭", + "8" => "౮", + "9" => "౯" diff --git a/govarnam/schemes/scripts/ANALYSIS.md b/govarnam/schemes/scripts/ANALYSIS.md new file mode 100644 index 0000000..900951c --- /dev/null +++ b/govarnam/schemes/scripts/ANALYSIS.md @@ -0,0 +1,45 @@ +## dhi + +Conjunct Frequency Percentage-among-them + +ധി 154 62 +ദ്ധി 55 22 +ത്ഥി 18 7 +ധീ 8 3 +ദ്ധീ 7 2 +ഥി 4 1 +ഥീ 0 +ത്ഥീ 0 + += 246 + +## tha + +ത 978 45 +ത്ത 480 22 +താ 436 20 +ത്താ 99 4.6 +ഥ 98 4.5 +ഥാ 33 1.5 +ത്ഥ 16 0.7 +ത്ഥാ 4 0.01 + += 2144 + +## ya + +യ 1852 68 +യാ 839 32 + += 2691 + +## nda + +ണ്ടാ 343 52 +ണ്ട 265 40 +ന്ദ 39 5 +ണ്ഡ 7 +ന്ദാ 1 +ണ്ഡാ 1 + += 656 diff --git a/govarnam/schemes/scripts/README.md b/govarnam/schemes/scripts/README.md new file mode 100644 index 0000000..6affdcc --- /dev/null +++ b/govarnam/schemes/scripts/README.md @@ -0,0 +1,87 @@ +# Scripts + +## Remove symbols from word frequency report + +This script removes VST symbols from a word frequency report file. + +Sometimes frequency report will have items like "ലൂ", "ഓ" which is unnecessary because tokenizer can make these on its own. + +GoVarnam won't learn single conjuncts anyway, so why keep it in report ? 
Remove them with: + +```bash +python3 frequency-report-remove-symbols.py scheme.vst wordFrequencyReport.txt outputFile.txt +``` + +### Word Frequency Report File + +A word frequency report file has this format: +``` +word frequency +``` +Example: +``` +എന്ത് 14569 +ഇത് 2045 +വർഗ്ഗം 254 +എന്ന 254 +ഒരു 254 +താളിലേക്ക് 254 +ഫലകം 254 +``` + +This file is made by analysing the usage of words on the internet. [This repo](https://github.com/AI4Bharat/indicnlp_corpus#text-corpora) has a premade vocab frequency file for some Indian languages. [Indic Keyboard](https://gitlab.com/indicproject/dictionaries) also has one. + +### Normalize Frequency + +A frequency report may have a large difference between the first word and the last word, like this: + +``` +എന്ത് 14569000 +... +... +ഫലകം 254 +``` + +This is bad because suggestions can come out wrong. We need to normalize these values between a min and max. + +To normalize the frequency of words between a min value (15) and a max value (255), we can use this: + +``` +perl frequency-normalizer.pl frequencyReport.txt 15 255 > output-report.txt +``` + +## Populate weight column in VST + +In GoVarnam's VST, we will have a weight for each possibility symbol. This is to make the tokenizer output better for possible suggestions. The more a symbol is in popular usage, the greater its weight in the tokenizer output. + +* Get a word frequency report file (explained at the top of this README) + +Such a file makes it very easy to calculate symbol frequency. We just need to make a hashmap of each symbol in a word and add the corresponding word frequency value. + +After we go through the entire list of words, we will have a hashmap of symbol frequency.
+ +* Make the symbol frequency report: +``` +python3 symbol-frequency-maker.py scheme.vst word-frequency.txt symbol-frequency.txt +``` + +Now the output file will have content similar to this: +``` +അ 951134 +എ 763499 +വ 739865 +നി 710719 +ക 500238 +രു 478358 +``` + +* Normalize the frequency: + +``` +perl frequency-normalizer.pl symbol-frequency.txt 0 100 > symbol-frequency-normalized.txt +``` + +* Now, use this file to populate the weight column in VST: +``` +python3 symbol-weight-update-in-vst.py ml.vst symbol-frequency-normalized.txt +``` diff --git a/govarnam/schemes/scripts/frequency-normalizer.pl b/govarnam/schemes/scripts/frequency-normalizer.pl new file mode 100644 index 0000000..b967524 --- /dev/null +++ b/govarnam/schemes/scripts/frequency-normalizer.pl @@ -0,0 +1,53 @@ +# Reweigh the words +# Sample: word 8671269 to word 200 +# Source: command line argument +# Output to terminal + +# Biggest value = # of lines. +# Divide this by 240 and round up (255-14 to avoid 0-15 values) +# Divide all other values (lines left in the list) by that number and round down. +# All values should now be between 15 and 254.
+ +if( $#ARGV != 2 ){ + print "Need 3 arguments: \n"; + die; +} + +# Open original file +use utf8; +open FILE, $ARGV[0] or die $!; +my $count=0; + +my $min = $ARGV[1]; +my $max = $ARGV[2]; + +# Count the # of lines +while () { + $count++; +} + +# Calculate the divider to ensure results between min and max +my $divider = int( $count / ($max - $min)) + 1; + +sub is_integer { $_[0] =~ /^[+-]?\d+$/ } +# Re-open the source file and update the weight +open FILE, "<:encoding(utf8)", $ARGV[0] or die $!; + +# remove ’, “, ।, —, ‘, ·, −, °, ”, ॥ +while (my $line = ) { + $count--; + + # Replace the weight if its a word line, + # otherwise print without actions + if ($line =~ /\s/) { + my $weighed = int( $count / $divider) + $min; + my ($name) = $line =~ m/(.*)\s/; + if (length($name) > 1 && !is_integer($name)) { + $line =~ s/(\d*[.])?\d+/$weighed/g; + utf8::encode($line); + print $line; + } + } +} + +close FILE; diff --git a/govarnam/schemes/scripts/frequency-report-remove-symbols.py b/govarnam/schemes/scripts/frequency-report-remove-symbols.py new file mode 100644 index 0000000..d95026e --- /dev/null +++ b/govarnam/schemes/scripts/frequency-report-remove-symbols.py @@ -0,0 +1,53 @@ +import sqlite3 +import sys + +desc = ''' +Usage: script.py scheme.vst inputFile.txt outputFile.txt + +This script removes VST symbol words from a word frequency report file. +Sometimes frequency report will have items like "ലൂ", "ഓ" which +is unnecessary because tokenizer have these and can make these. + +GoVarnam won't learn single conjuncts anyway, so why keep it in report ? + +INPUT FILE MUST BE WORD FREQUENCY REPORT. 
FORMAT: +wordfrequency +എന്ത് 14569 +ഇത് 2045 +''' + +if len(sys.argv) != 4: + print(desc) + sys.exit(0) + +db = sys.argv[1] +file = sys.argv[2] +outputFile = sys.argv[3] + +con = sqlite3.connect(db) +cur = con.cursor() + +cur.execute("SELECT value1 FROM symbols") +value1 = cur.fetchall() +symbols = [] +for s in value1: + symbols.append(s[0]) + +base = 0 +with open(file, "r", encoding="utf8", errors='ignore') as fileInput: + with open(outputFile, "w") as fileOutput: + processedCount = 0 + + for line in fileInput: + word, frequency = line.split(" ") + + if word in symbols: + continue + + fileOutput.write(line) + + processedCount += 1 + + if processedCount % 1000 == 0: + print("Processed " + str(processedCount) + " lines") + diff --git a/govarnam/schemes/scripts/make-pack.py b/govarnam/schemes/scripts/make-pack.py new file mode 100644 index 0000000..0b7dba7 --- /dev/null +++ b/govarnam/schemes/scripts/make-pack.py @@ -0,0 +1,68 @@ +import json +import os +import re +import sys + +desc = ''' +Usage: script.py + +Example: script.py ml ./ml-basic + +Pack JSON file name is pack.json + +WordFrequency report file should be word-frequency-report.txt +Pattern file should be patterns.txt +''' + +if len(sys.argv) != 3: + print(desc) + sys.exit(0) + +schemeID = sys.argv[1] +packDir = os.path.realpath(sys.argv[2]) + +packJSON = os.path.join(packDir, "pack.json") +wordsFile = os.path.join(packDir, "word-frequency-report.txt") +patternsFile = os.path.join(packDir, "patterns.txt") + +os.environ["VARNAM_LEARNINGS_DIR"] = packDir + +# Learn from frequency report +if os.path.exists(wordsFile): + os.system("varnamcli -s %s -learn-from-file %s" % (schemeID, wordsFile)) + +# Learn from patterns +if os.path.exists(patternsFile): + os.system("varnamcli -s %s -train-from-file %s" % (schemeID, patternsFile)) + +with open(packJSON, "r") as jsonFile: + packInfo = json.load(jsonFile) + +# Export +os.system("varnamcli -s %s -export %s" % (schemeID, os.path.join(packDir, packInfo["identifier"]))) 
+ +pageIndex = 1 +wordsCount = 0 +for v in packInfo["pages"]: + vlfPath = os.path.join(packDir, v["identifier"] + ".vlf") + vlfContents = open(vlfPath).read() + + v["page"] = pageIndex + v["size"] = os.path.getsize( + vlfPath + ) + + # Gets the first word's confidence + firstConfidence = re.search(r'c":(.*?),', vlfContents).group(1) + v["description"] = "Words with confidence lesser than " + firstConfidence + + wordsCount += len(re.findall(r'"c"', vlfContents)) + pageIndex += 1 + +packInfo["total_words"] = wordsCount +packInfo["pages_count"] = len(packInfo["pages"]) + +with open(packJSON, "w") as jsonFile: + json.dump(packInfo, jsonFile, indent=2, ensure_ascii=False) + +print("Finished making pack. Hopefully pack.json has the correct number of pages") diff --git a/govarnam/schemes/scripts/symbol-frequency-maker.py b/govarnam/schemes/scripts/symbol-frequency-maker.py new file mode 100644 index 0000000..8b1444c --- /dev/null +++ b/govarnam/schemes/scripts/symbol-frequency-maker.py @@ -0,0 +1,74 @@ +import sqlite3 +import sys + +desc = ''' +Usage: script.py scheme.vst wordReportFile.txt outputSymbolReportFile.txt + +INPUT FILE MUST BE WORD FREQUENCY REPORT. 
FORMAT: +wordfrequency +എന്ത് 14569 +ഇത് 2045 +''' + +if len(sys.argv) != 4: + print(desc) + sys.exit(0) + +db = sys.argv[1] +file = sys.argv[2] +outputFile = sys.argv[3] + +con = sqlite3.connect(db) +cur = con.cursor() + +cur.execute("SELECT pattern, value1 FROM symbols WHERE pattern IN (SELECT pattern from symbols GROUP by pattern HAVING COUNT(pattern) > 1)") +patternsAndSymbols = cur.fetchall() +symbols = [] +for s in patternsAndSymbols: + symbols.append(s[1]) + +freqs = {} + + +def add(char, frequency): + if char in freqs: + freqs[char] += int(frequency) + else: + freqs[char] = int(frequency) + # print("Incremented %s - %s" % (char, frequency)) + + +base = 0 +with open(file, "r", encoding="utf8", errors='ignore') as f: + processedCount = 0 + for line in f: + word, frequency = line.split(" ") + + i = 0 + sequence = "" + while i < len(word): + sequence += word[i] + + if sequence not in symbols: + # backtrack + if sequence[0:-1] in symbols: + add(sequence[0:-1], frequency) + sequence = sequence[-1] + else: + if i == len(word) - 1: + # Last character + add(sequence, frequency) + + i += 1 + + processedCount += 1 + + if processedCount % 30 == 0: + print("Processed " + str(processedCount) + " words") + +freqs = dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True)) + +with open(outputFile, 'a+') as out: + for grapheme, weight in freqs.items(): + outLine = grapheme + " " + str(weight) + out.write(outLine + '\n') diff --git a/govarnam/schemes/scripts/symbol-weight-update-in-vst.py b/govarnam/schemes/scripts/symbol-weight-update-in-vst.py new file mode 100644 index 0000000..94ab38c --- /dev/null +++ b/govarnam/schemes/scripts/symbol-weight-update-in-vst.py @@ -0,0 +1,76 @@ +import sqlite3 +import sys + +desc = ''' +Usage: script.py scheme.vst symbolFrequencyReport.txt + +INPUT FILE MUST BE SYMBOL FREQUENCY REPORT. 
FORMAT: +symbolfrequency +സ 2045414 +യെ 1456 +''' + +if len(sys.argv) != 3: + print(desc) + sys.exit(0) + +db = sys.argv[1] +file = sys.argv[2] + +con = sqlite3.connect(db) +cur = con.cursor() + +cur.execute("SELECT pattern, value1, type FROM symbols WHERE match_type = 2 AND pattern IN (SELECT pattern from symbols GROUP by pattern HAVING COUNT(pattern) > 1)") +patternsAndSymbols = cur.fetchall() + +freqs = {} +with open(file, "r", encoding="utf8", errors='ignore') as f: + for line in f: + symbol, frequency = line.split(" ") + freqs[symbol] = int(frequency) + +patternAndSymbols = {} +for pattern, symbol, symbolType in patternsAndSymbols: + if pattern not in patternAndSymbols: + patternAndSymbols[pattern] = [( + symbol, + symbolType, + freqs[symbol] if symbol in freqs else 0 + )] + else: + patternAndSymbols[pattern].append(( + symbol, + symbolType, + freqs[symbol] if symbol in freqs else 0 + )) + +for pattern, symbols in patternAndSymbols.items(): + CONSONANT = 2 # ണ്ട + CONSONANT_VOWEL = 4 # ണ്ടാ + + ranks = {} + + # Find the consonant with least frequency value + minConsonantFrequency = 100 + maxConsonantVowelFrequency = 1 + for symbol, symbolType, frequency in symbols: + if symbolType == CONSONANT and minConsonantFrequency > frequency: + minConsonantFrequency = frequency + + if symbolType == CONSONANT_VOWEL and maxConsonantVowelFrequency < frequency: + maxConsonantVowelFrequency = frequency + + if symbolType != CONSONANT_VOWEL: + ranks[symbol] = frequency + + for symbol, symbolType, frequency in symbols: + if symbolType == CONSONANT_VOWEL: + ranks[symbol] = int((frequency / maxConsonantVowelFrequency) * (minConsonantFrequency / 2)) + + ranks = dict(sorted(ranks.items(), key=lambda item: item[1], reverse=True)) + + for symbol, rank in ranks.items(): + cur.execute("UPDATE symbols SET weight = ? WHERE pattern = ? 
AND value1 = ?", (rank, pattern, symbol)) + + rank += 1 +con.commit() diff --git a/govarnam/schemes/ta/README.md b/govarnam/schemes/ta/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/ta/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/ta/import.sh b/govarnam/schemes/ta/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/ta/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit +fi + +schemeID=$(ls $SCRIPT_DIR/*.vst) +schemeID=${schemeID/$SCRIPT_DIR\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in $SCRIPT_DIR/*/*.vlf; do + varnamcli -s $schemeID -import $vlf +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/te/README.md b/govarnam/schemes/te/README.md new file mode 100644 index 0000000..e99cef2 --- /dev/null +++ b/govarnam/schemes/te/README.md @@ -0,0 +1,11 @@ +Installation +------------ + +1. Open a terminal from this folder (Right click -> Open terminal). +2. Type this and press enter: + +sudo ./install.sh install + +3. 
To import words, type this and press enter: + +./import.sh \ No newline at end of file diff --git a/govarnam/schemes/te/import.sh b/govarnam/schemes/te/import.sh new file mode 100644 index 0000000..2adb6da --- /dev/null +++ b/govarnam/schemes/te/import.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +if [[ "$EUID" == 0 ]]; then + msg="This script should NOT be run with sudo" + echo "$msg" + notify-send "$msg" &> /dev/null || true + exit +fi + +schemeID=$(ls $SCRIPT_DIR/*.vst) +schemeID=${schemeID/$SCRIPT_DIR\//} +schemeID=${schemeID/.vst/} + +# Install Language Packs + +for vlf in $SCRIPT_DIR/*/*.vlf; do + varnamcli -s $schemeID -import $vlf +done + +msg="Finished importing $schemeID language learnings" +echo "$msg" +notify-send "$msg" &> /dev/null || true diff --git a/govarnam/schemes/test/ml.rb b/govarnam/schemes/test/ml.rb new file mode 100644 index 0000000..7abe5db --- /dev/null +++ b/govarnam/schemes/test/ml.rb @@ -0,0 +1,46 @@ +# encoding: utf-8 + +class TestML < Minitest::Test + def setup + @varnam = get_varnam_handle('ml') + @varnam.config(Varnam::VARNAM_CONFIG_SET_TOKENIZER_SUGGESTIONS_LIMIT, 30) + end + + def test_words + list = { + 'peN' => 'പെൺ', + + # BEGIN Anusvara <-> ma complications + 'am_bEdkar' => 'അംബേദ്കർ', + 'manam_pOle' => 'മനംപോലെ', + 'kunnamkuLam' => 'കുന്നംകുളം', + 'pamkthi' => 'പംക്തി', + 'kambiyil' => 'കമ്പിയിൽ', + 'mvOnE' => 'മ്വോനേ', + 'mvOnoossE' => 'മ്വോനൂസ്സേ', + 'manushyan' => 'മനുഷ്യൻ', + 'mlEchcham' => 'മ്ലേച്ചം', + # END Anusvara <-> ma complications + + 'kiLivaathil' => 'കിളിവാതിൽ', + 'kiLivaathilil' => 'കിളിവാതിലിൽ', + 'thaazhvara' => 'താഴ്വര', + 'thaazh_vara' => 'താഴ്‌വര', + 'ANkiLi' => 'ആൺകിളി' + } + list.each do |pattern, expected| + # TODO assert length of result array too + assert_equal expected, @varnam.transliterate(pattern)[0].Word + end + end + + def test_reverse_transliteration + list = { + 'മലയാളം' => %w[malayaaLam malayaaLam_ malayALam 
malayALam_ malayaalam malayaalam_ malayAlam malayAlam_ malayaLam malayaLam_ malayalam malayalam_] + } + + list.each do |word, expected| + assert_equal expected, @varnam.reverse_transliterate(word) + end + end +end diff --git a/govarnam/schemes/test/run.rb b/govarnam/schemes/test/run.rb new file mode 100644 index 0000000..f98a147 --- /dev/null +++ b/govarnam/schemes/test/run.rb @@ -0,0 +1,26 @@ +#!/usr/bin/env ruby + +# encoding: utf-8 + +require_relative '../varnam' + +$handles = {} +def get_varnam_handle(scheme_id) + return $handles[scheme_id] if $handles[scheme_id] + + if find_govarnam.nil? + puts "Can't find govarnam shared library." + exit 1 + end + + learnings_file = Tempfile.new('learnings_file') + $handles[scheme_id] = VarnamInstance.new( + "#{__dir__}/../schemes/#{scheme_id}/#{scheme_id}.vst", + learnings_file.path + ) + + $handles[scheme_id] +end + +require "minitest/autorun" +require_relative './ml' diff --git a/govarnam/schemes/varnam.rb b/govarnam/schemes/varnam.rb new file mode 100644 index 0000000..4c45cc2 --- /dev/null +++ b/govarnam/schemes/varnam.rb @@ -0,0 +1,112 @@ +# encoding: utf-8 + +def gem_available?(name) + require name +rescue LoadError + false +end + +if not gem_available?('ffi') + puts "Can't find gem - ffi. To install run '[sudo] gem install ffi'" + exit(1) +end + +$library = nil +def find_govarnam + return $library if not $library.nil? 
+ + # Trying to find out govarnam in the predefined locations if + # absolute path to the library is not specified + govarnam_search_paths = ['.', File.dirname(File.expand_path(__FILE__)), '/usr/local/lib', '/usr/local/lib/i386-linux-gnu', '/usr/local/lib/x86_64-linux-gnu', '/usr/lib/i386-linux-gnu', '/usr/lib/x86_64-linux-gnu', '/usr/lib'] + govarnam_names = ['libgovarnam.so', "libgovarnam.so.#{$govarnam_major_version}", 'libgovarnam.dylib', 'varnam.dll'] + govarnam_search_paths.each do |path| + govarnam_names.each do |fname| + fullpath = File.join(path, fname) + if File.exist?(fullpath) + $library = fullpath + return $library + end + end + end + nil +end + +def initialize_vst_maker_handle(vst_path) + require_relative './varnamruby.rb' + + varnam_handle_ptr = FFI::MemoryPointer.new :pointer + + initialized = VarnamLibrary.vm_init(vst_path, varnam_handle_ptr) + + varnam_handle = varnam_handle_ptr.read_int + + if initialized != 0 + msg = VarnamLibrary.varnam_get_last_error(varnam_handle) + puts "Varnam initialization failed #{msg}" + exit(1) + end + + varnam_handle +end + +class VarnamInstance + def initialize(vst_path, learnings_path) + require_relative './varnamruby' + + varnam_handle_ptr = FFI::MemoryPointer.new :pointer + VarnamLibrary.varnam_init(vst_path, learnings_path, varnam_handle_ptr) + @handle = varnam_handle_ptr.read_int + end + + def config(key, value) + configured = VarnamLibrary.varnam_config(@handle, key, value) + if configured != 0 + error_message = VarnamLibrary.varnam_get_last_error(@handle) + error error_message + return + end + true + end + + def transliterate(input) + arr_ptr = FFI::MemoryPointer.new :pointer + VarnamLibrary.varnam_transliterate( + @handle, + 1, + input, + arr_ptr + ) + + sugs = [] + size = VarnamLibrary.varray_length(arr_ptr.get_pointer(0)) + 0.upto(size - 1) do |i| + word_ptr = VarnamLibrary.varray_get(arr_ptr.get_pointer(0), i) + vsug = VarnamLibrary::Suggestion.new(word_ptr) + sugs.push( + Suggestion.new( + 
vsug[:Word].force_encoding('UTF-8'), + vsug[:Weight], + vsug[:LearnedOn] + ) + ) + end + sugs + end + + def reverse_transliterate(input) + arr_ptr = FFI::MemoryPointer.new :pointer + VarnamLibrary.varnam_reverse_transliterate( + @handle, + input, + arr_ptr + ) + + result = [] + size = VarnamLibrary.varray_length(arr_ptr.get_pointer(0)) + 0.upto(size - 1) do |i| + word_ptr = VarnamLibrary.varray_get(arr_ptr.get_pointer(0), i) + result.push(word_ptr.get_pointer(0).get_string(0).force_encoding('UTF-8')) + end + result + end +end diff --git a/govarnam/schemes/varnamruby.rb b/govarnam/schemes/varnamruby.rb new file mode 100644 index 0000000..4be9c21 --- /dev/null +++ b/govarnam/schemes/varnamruby.rb @@ -0,0 +1,182 @@ +## +# Copyright (C) Navaneeth.K.N +# +# This is part of libvarnam. See LICENSE.txt for the license +## + +require 'ffi' +require 'singleton' + +# Ruby wrapper for libvarnam +module VarnamLibrary + extend FFI::Library + ffi_lib $library + + class Symbol < FFI::Struct + layout :Identifier, :int, + :Type, :int, + :MatchType, :int, + :Pattern, :string, + :Value1, :string, + :Value2, :string, + :Value3, :string, + :Tag, :string, + :Weight, :int, + :Priority, :int, + :AcceptCondition, :int, + :Flags, :int + + def value1=(val) + pos = offset_of(:Value1) + if val.nil? 
+ self.pointer.put_pointer(pos, FFI::MemoryPointer::NULL) + elsif val.is_a?(FFI::MemoryPointer) + self.pointer.put_pointer(pos, val) + else + fail("name= requires an FFI::MemoryPointer or nil") + end + end + end + + class SchemeDetails < FFI::Struct + layout :identifier, :pointer, + :langCode, :pointer, + :displayName, :pointer, + :author, :pointer, + :compiledDate, :pointer, + :isStable, :int + end + + class Suggestion < FFI::Struct + layout :Word, :string, + :Weight, :int, + :LearnedOn, :int + end + + attach_function :varnam_debug, [:int, :int], :void + attach_function :varnam_get_last_error, [:int], :string + attach_function :varnam_set_vst_lookup_dir, [:string], :int + attach_function :varnam_config, [:int, :int, :int], :int + + attach_function :varnam_new_search_symbol, [:pointer], :int + attach_function :varnam_search_symbol_table, [:int, :int, Symbol.by_value, :pointer], :int + + attach_function :varnam_init, [:string, :string, :pointer], :int + attach_function :varnam_transliterate, [:int, :int, :string, :pointer], :int + attach_function :varnam_reverse_transliterate, [:int, :string, :pointer], :int + + attach_function :vm_init, [:string, :pointer], :int + attach_function :vm_create_token, [:int, :string, :string, :string, :string, :string, :int, :int, :int, :int, :int], :int + attach_function :vm_delete_token, [:int, Symbol.by_value], :int + attach_function :vm_flush_buffer, [:int], :int + attach_function :vm_set_scheme_details, [:int, :pointer], :int + + attach_function :varray_get, [:pointer, :int], :pointer + attach_function :varray_length, [:pointer], :int +end + +VarnamSymbol = Struct.new(:type, :pattern, :value1, :value2, :value3, :tag, :match_type, :priority, :accept_condition, :flags, :weight) +Suggestion = Struct.new(:Word, :Weight, :LearnedOn) +VarnamSchemeDetails = Struct.new(:langCode, :identifier, :displayName, :author, :compiledDate, :isStable) + +module Varnam + VARNAM_SYMBOL_VOWEL = 1 + VARNAM_SYMBOL_CONSONANT = 2 + 
VARNAM_SYMBOL_DEAD_CONSONANT = 3 + VARNAM_SYMBOL_CONSONANT_VOWEL = 4 + VARNAM_SYMBOL_NUMBER = 5 + VARNAM_SYMBOL_SYMBOL = 6 + VARNAM_SYMBOL_ANUSVARA = 7 + VARNAM_SYMBOL_VISARGA = 8 + VARNAM_SYMBOL_VIRAMA = 9 + VARNAM_SYMBOL_OTHER = 10 + VARNAM_SYMBOL_NON_JOINER = 11 + VARNAM_SYMBOL_JOINER = 12 + VARNAM_SYMBOL_PERIOD = 13 + + VARNAM_MATCH_EXACT = 1 + VARNAM_MATCH_POSSIBILITY = 2 + + VARNAM_CONFIG_USE_DEAD_CONSONANTS = 100 + VARNAM_CONFIG_IGNORE_DUPLICATE_TOKEN = 101 + VARNAM_CONFIG_ENABLE_SUGGESTIONS = 102 + VARNAM_CONFIG_USE_INDIC_DIGITS = 103 + VARNAM_CONFIG_SET_DICTIONARY_SUGGESTIONS_LIMIT = 104 + VARNAM_CONFIG_SET_PATTERN_DICTIONARY_SUGGESTIONS_LIMIT = 105 + VARNAM_CONFIG_SET_TOKENIZER_SUGGESTIONS_LIMIT = 106 + VARNAM_CONFIG_SET_DICTIONARY_MATCH_EXACT = 107 + + VARNAM_TOKEN_PRIORITY_HIGH = 1 + VARNAM_TOKEN_PRIORITY_NORMAL = 0 + VARNAM_TOKEN_PRIORITY_LOW = -1 + + VARNAM_TOKEN_ACCEPT_ALL = 0 + VARNAM_TOKEN_ACCEPT_IF_STARTS_WITH = 1 + VARNAM_TOKEN_ACCEPT_IF_IN_BETWEEN = 2 + VARNAM_TOKEN_ACCEPT_IF_ENDS_WITH = 3 + + VARNAM_LOG_DEFAULT = 1 + VARNAM_LOG_DEBUG = 2 + + class RuntimeContext + include Singleton + + def initialize + @errors = 0 + @warnings = 0 + @tokens = {} + @current_expression = "" + @error_messages = [] + @warning_messages = [] + @current_tag = nil + end + + def errored + @errors += 1 + end + + def warned + @warnings += 1 + end + + def errors + @errors + end + + def warnings + @warnings + end + + attr_accessor :tokens, :current_expression, :error_messages, :warning_messages, :current_tag + end +end + +def _context + return Varnam::RuntimeContext.instance +end + +def get_source_file_with_linenum + Kernel::caller.last.sub(":in `
'", "") # We don't need :in `
' to appear and make confusion +end + +def inform(message) + puts " #{message}" +end + +def warn(message) + if _context.current_expression.nil? + _context.warning_messages.push "#{get_source_file_with_linenum} : WARNING: #{message}" + else + _context.warning_messages.push "#{get_source_file_with_linenum} : WARNING: In expression #{_context.current_expression}. #{message}" + end + _context.warned +end + +def error(message) + if _context.current_expression.nil? + _context.error_messages.push "#{get_source_file_with_linenum} : ERROR : #{message}" + else + _context.error_messages.push "#{get_source_file_with_linenum} : ERROR : In expression #{_context.current_expression}. #{message}" + end + _context.errored +end diff --git a/rust/composition_processor/src/engine.rs b/rust/composition_processor/src/engine.rs index 6fc3a3b..90b5ba9 100644 --- a/rust/composition_processor/src/engine.rs +++ b/rust/composition_processor/src/engine.rs @@ -13,6 +13,21 @@ use windows::{ Input::KeyboardAndMouse::VK_SHIFT, TextServices::{ITfThreadMgr, TF_LBI_STATUS_DISABLED, TF_LBI_STATUS_HIDDEN}, }, + // System::SystemServices::{ + // LANG_MALAYALAM, + // LANG_ASSAMESE, + // LANG_MARATHI, + // LANG_BENGALI, + // LANG_NEPALI, + // LANG_GUJARATI, + // LANG_ODIA, + // LANG_HINDI, + // LANG_PUNJABI, + // LANG_KANNADA, + // LANG_SANSKRIT, + // LANG_TAMIL, + // LANG_TELUGU, + // }, }, }; @@ -39,9 +54,6 @@ use once_cell::sync::Lazy; use govarnam::Varnam; static VARNAM: Lazy = Lazy::new(|| { - // DEBUG - // This should be adaptive to language switches - // Currently using Malayalam VST for debugging let dll_instance_handle = unsafe { ime::dll::DLL_INSTANCE }; let file_name = unsafe { @@ -52,6 +64,23 @@ static VARNAM: Lazy = Lazy::new(|| { let dir = std::path::Path::new(&file_name[..]).parent().unwrap(); + // let (scheme_path, learning_path) = match active_langid as u32 { + // LANG_MALAYALAM => (dir.join("schemes/ml/ml.vst"), dir.join("schemes/learnings/ml.vst.learnings")), // Malayalam + // 
LANG_ASSAMESE => (dir.join("schemes/as/as.vst"), dir.join("schemes/learnings/as.vst.learnings")), // Assamese + // LANG_MARATHI => (dir.join("schemes/mr/mr.vst"), dir.join("schemes/learnings/mr.vst.learnings")), // Marathi + // LANG_BENGALI => (dir.join("schemes/bn/bn.vst"), dir.join("schemes/learnings/bn.vst.learnings")), // Bengali + // LANG_NEPALI => (dir.join("schemes/ne/ne.vst"), dir.join("schemes/learnings/ne.vst.learnings")), // Nepali + // LANG_GUJARATI => (dir.join("schemes/gu/gu.vst"), dir.join("schemes/learnings/gu.vst.learnings")), // Gujarati + // LANG_ODIA => (dir.join("schemes/or/or.vst"), dir.join("schemes/learnings/or.vst.learnings")), // Odia + // LANG_HINDI => (dir.join("schemes/hi/hi.vst"), dir.join("schemes/learnings/hi.vst.learnings")), // Hindi + // LANG_PUNJABI => (dir.join("schemes/pa/pa.vst"), dir.join("schemes/learnings/pa.vst.learnings")), // Punjabi + // LANG_KANNADA => (dir.join("schemes/kn/kn.vst"), dir.join("schemes/learnings/kn.vst.learnings")), // Kannada + // LANG_SANSKRIT => (dir.join("schemes/sa/sa.vst"), dir.join("schemes/learnings/sa.vst.learnings")), // Sanskrit + // LANG_TAMIL => (dir.join("schemes/ta/ta.vst"), dir.join("schemes/learnings/ta.vst.learnings")), // Tamil + // LANG_TELUGU => (dir.join("schemes/te/te.vst"), dir.join("schemes/learnings/te.vst.learnings")), // Telugu + // _ => panic!("Unsupported language ID: {}", active_langid), // Panic for unsupported languages + // }; + let scheme_path = dir.join("schemes/ml/ml.vst"); let learning_path = dir.join("schemes/learnings/ml.vst.learnings"); @@ -109,10 +138,6 @@ impl CompositionProcessorEngine { .init(thread_mgr, client_id, &self.compartment_wrapper) .ok(); unsafe { ime::font::set_default_candidate_text_font() }; - // self.setup_dictionary_file( - // unsafe { ime::dll::DLL_INSTANCE }, - // ime::resources::TEXTSERVICE_DIC, - // ); true } @@ -139,6 +164,42 @@ impl CompositionProcessorEngine { // let results: Vec<&str> = Vec::from(["stuff", "stuff", "stuff", "stuff"]); + 
let current_language = self.language_bar.get_active_langid(); + + if let Ok(lang_id) = current_language { + use std::io::prelude::*; + + let mut file = std::fs::OpenOptions::new() + .write(true) + .append(true) + .open("C:\\Users\\doxop\\Documents\\debug.txt") + .unwrap(); + + if let Err(e) = writeln!(file, "Language ID: {}", lang_id) { + eprintln!("Couldn't write to file: {}", e); + let mut error_file = std::fs::OpenOptions::new() + .write(true) + .append(true) + .open("C:\\Users\\doxop\\Documents\\debug.txt") + .unwrap(); + if let Err(e) = writeln!(error_file, "Error: {}", e) { + eprintln!("Couldn't write error to file: {}", e); + } + } + } else { + use std::io::prelude::*; + + let mut file = std::fs::OpenOptions::new() + .write(true) + .append(true) + .open("C:\\Users\\doxop\\Documents\\debug.txt") + .unwrap(); + + if let Err(e) = writeln!(file, "Error: Failed to get active language ID") { + eprintln!("Couldn't write error to file: {}", e); + } + } + for result in results { matches.push((keystroke_buffer.clone(), result.to_string())) } diff --git a/rust/composition_processor/src/engine/language_bar.rs b/rust/composition_processor/src/engine/language_bar.rs index ad438db..daf907f 100644 --- a/rust/composition_processor/src/engine/language_bar.rs +++ b/rust/composition_processor/src/engine/language_bar.rs @@ -1,7 +1,7 @@ use std::ffi::c_void; use globals::{ - SAMPLEIME_GUID_COMPARTMENT_DOUBLE_SINGLE_BYTE, SAMPLEIME_GUID_COMPARTMENT_PUNCTUATION, + SAMPLEIME_GUID_PROFILE, SAMPLEIME_GUID_COMPARTMENT_DOUBLE_SINGLE_BYTE, SAMPLEIME_GUID_COMPARTMENT_PUNCTUATION, }; use ime::resources::{ IME_MODE_DESCRIPTION, IME_MODE_OFF_ICO_INDEX, IME_MODE_ON_ICO_INDEX, @@ -13,11 +13,12 @@ use itf_components::{ use windows::{ core::ComInterface, Win32::UI::TextServices::{ - ITfCompartmentEventSink, ITfLangBarItemButton, ITfThreadMgr, + ITfCompartmentEventSink, ITfLangBarItemButton, ITfThreadMgr, ITfInputProcessorProfileMgr, TF_INPUTPROCESSORPROFILE, 
GUID_COMPARTMENT_KEYBOARD_INPUTMODE_CONVERSION, GUID_COMPARTMENT_KEYBOARD_OPENCLOSE, - GUID_LBI_INPUTMODE, + GUID_LBI_INPUTMODE, CLSID_TF_InputProcessorProfiles }, }; +use ime::com::create_instance_inproc; use super::compartment_update_listener::{compartment_callback, CompartmentUpdateListener}; @@ -103,6 +104,18 @@ impl LanguageBar { Ok(()) } + + pub fn get_active_langid(&self) -> windows::core::Result { + let profile_manager: ITfInputProcessorProfileMgr = + create_instance_inproc(&CLSID_TF_InputProcessorProfiles)?; + + let mut profile: TF_INPUTPROCESSORPROFILE = Default::default(); + unsafe { + profile_manager.GetActiveProfile(&SAMPLEIME_GUID_PROFILE, &mut profile)?; + } + + Ok(profile.langid) + } } impl Drop for LanguageBar { diff --git a/rust/ime/src/dll.rs b/rust/ime/src/dll.rs index 4684b83..a821d36 100644 --- a/rust/ime/src/dll.rs +++ b/rust/ime/src/dll.rs @@ -3,6 +3,22 @@ use windows::Win32::Foundation::{E_FAIL, HMODULE, S_OK}; use crate::registry; +use windows::Win32::System::SystemServices::{ + LANG_MALAYALAM, SUBLANG_MALAYALAM_INDIA, + LANG_ASSAMESE, SUBLANG_ASSAMESE_INDIA, + LANG_MARATHI, SUBLANG_MARATHI_INDIA, + LANG_BENGALI, SUBLANG_BENGALI_INDIA, + LANG_NEPALI, SUBLANG_NEPALI_INDIA, + LANG_GUJARATI, SUBLANG_GUJARATI_INDIA, + LANG_ODIA, SUBLANG_ODIA_INDIA, + LANG_HINDI, SUBLANG_HINDI_INDIA, + LANG_PUNJABI, SUBLANG_PUNJABI_INDIA, + LANG_KANNADA, SUBLANG_KANNADA_INDIA, + LANG_SANSKRIT, SUBLANG_SANSKRIT_INDIA, + LANG_TAMIL, SUBLANG_TAMIL_INDIA, + LANG_TELUGU, SUBLANG_TELUGU_INDIA, +}; + #[no_mangle] pub static mut DLL_INSTANCE: HMODULE = HMODULE(0); @@ -11,10 +27,31 @@ pub static mut DLL_INSTANCE: HMODULE = HMODULE(0); #[doc(hidden)] unsafe extern "system" fn DllRegisterServer() -> HRESULT { unsafe fn register() -> windows::core::Result<()> { + let lang_ids = [ + (SUBLANG_MALAYALAM_INDIA << 10 | LANG_MALAYALAM) as u16, + (SUBLANG_ASSAMESE_INDIA << 10 | LANG_ASSAMESE) as u16, + (SUBLANG_MARATHI_INDIA << 10 | LANG_MARATHI) as u16, + (SUBLANG_BENGALI_INDIA 
<< 10 | LANG_BENGALI) as u16, + (SUBLANG_NEPALI_INDIA << 10 | LANG_NEPALI) as u16, + (SUBLANG_GUJARATI_INDIA << 10 | LANG_GUJARATI) as u16, + (SUBLANG_ODIA_INDIA << 10 | LANG_ODIA) as u16, + (SUBLANG_HINDI_INDIA << 10 | LANG_HINDI) as u16, + (SUBLANG_PUNJABI_INDIA << 10 | LANG_PUNJABI) as u16, + (SUBLANG_KANNADA_INDIA << 10 | LANG_KANNADA) as u16, + (SUBLANG_SANSKRIT_INDIA << 10 | LANG_SANSKRIT) as u16, + (SUBLANG_TAMIL_INDIA << 10 | LANG_TAMIL) as u16, + (SUBLANG_TELUGU_INDIA << 10 | LANG_TELUGU) as u16, + ]; + registry::register_server(DLL_INSTANCE) .map_err(|_| windows::core::Error::new(E_FAIL, "Failed to register server".into()))?; - registry::register_profile(DLL_INSTANCE)?; + + for lang_id in lang_ids.iter() { + registry::register_profile(DLL_INSTANCE, *lang_id)?; + } + registry::register_categories()?; + Ok(()) } @@ -30,8 +67,28 @@ unsafe extern "system" fn DllRegisterServer() -> HRESULT { #[allow(non_snake_case)] #[doc(hidden)] unsafe extern "system" fn DllUnregisterServer() -> HRESULT { - registry::unregister_profile().ok(); + let lang_ids = [ + (SUBLANG_MALAYALAM_INDIA << 10 | LANG_MALAYALAM) as u16, + (SUBLANG_ASSAMESE_INDIA << 10 | LANG_ASSAMESE) as u16, + (SUBLANG_MARATHI_INDIA << 10 | LANG_MARATHI) as u16, + (SUBLANG_BENGALI_INDIA << 10 | LANG_BENGALI) as u16, + (SUBLANG_NEPALI_INDIA << 10 | LANG_NEPALI) as u16, + (SUBLANG_GUJARATI_INDIA << 10 | LANG_GUJARATI) as u16, + (SUBLANG_ODIA_INDIA << 10 | LANG_ODIA) as u16, + (SUBLANG_HINDI_INDIA << 10 | LANG_HINDI) as u16, + (SUBLANG_PUNJABI_INDIA << 10 | LANG_PUNJABI) as u16, + (SUBLANG_KANNADA_INDIA << 10 | LANG_KANNADA) as u16, + (SUBLANG_SANSKRIT_INDIA << 10 | LANG_SANSKRIT) as u16, + (SUBLANG_TAMIL_INDIA << 10 | LANG_TAMIL) as u16, + (SUBLANG_TELUGU_INDIA << 10 | LANG_TELUGU) as u16, + ]; + + for lang_id in lang_ids.iter() { + registry::unregister_profile(*lang_id).ok(); + } + registry::unregister_categories().ok(); registry::unregister_server().ok(); + S_OK } diff --git a/rust/ime/src/lib.rs 
b/rust/ime/src/lib.rs index 04d275e..9d99f16 100644 --- a/rust/ime/src/lib.rs +++ b/rust/ime/src/lib.rs @@ -1,4 +1,4 @@ -mod com; +pub mod com; pub mod dll; pub mod font; pub mod icon; diff --git a/rust/ime/src/registry.rs b/rust/ime/src/registry.rs index a15fded..13f8337 100644 --- a/rust/ime/src/registry.rs +++ b/rust/ime/src/registry.rs @@ -6,7 +6,6 @@ use crate::com::create_instance_inproc; use windows::Win32::{ Foundation::{HMODULE, MAX_PATH}, System::LibraryLoader::GetModuleFileNameW, - System::SystemServices::{LANG_MALAYALAM, SUBLANG_MALAYALAM_INDIA}, UI::TextServices::{ CLSID_TF_CategoryMgr, CLSID_TF_InputProcessorProfiles, ITfCategoryMgr, ITfInputProcessorProfileMgr, GUID_TFCAT_DISPLAYATTRIBUTEPROVIDER, @@ -18,10 +17,7 @@ use windows::Win32::{ }; const TEXTSERVICE_DESC: &str = "Varnam Windows"; -// MAKELANGID(LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED) -// const TEXTSERVICE_LANGID: u16 = (SUBLANG_CHINESE_SIMPLIFIED << 10 | LANG_CHINESE) as u16; -const TEXTSERVICE_LANGID: u16 = (SUBLANG_MALAYALAM_INDIA << 10 | LANG_MALAYALAM) as u16; -// #define TEXTSERVICE_ICON_INDEX -IDIS_SAMPLEIME + const TEXTSERVICE_ICON_INDEX: u32 = -12i32 as u32; fn get_module_file_name(dll_instance_handle: HMODULE) -> String { @@ -32,7 +28,7 @@ fn get_module_file_name(dll_instance_handle: HMODULE) -> String { } } -pub fn register_profile(dll_instance_handle: HMODULE) -> windows::core::Result<()> { +pub fn register_profile(dll_instance_handle: HMODULE, testservice_langid: u16) -> windows::core::Result<()> { let profile_manager: ITfInputProcessorProfileMgr = create_instance_inproc(&CLSID_TF_InputProcessorProfiles)?; @@ -45,7 +41,7 @@ pub fn register_profile(dll_instance_handle: HMODULE) -> windows::core::Result<( unsafe { profile_manager.RegisterProfile( &SAMPLEIME_CLSID, - TEXTSERVICE_LANGID, + testservice_langid, &SAMPLEIME_GUID_PROFILE, &description, &icon_file_name, @@ -60,14 +56,14 @@ pub fn register_profile(dll_instance_handle: HMODULE) -> windows::core::Result<( Ok(()) } -pub fn 
unregister_profile() -> Result<(), windows::core::Error> { +pub fn unregister_profile(testservice_langid: u16) -> Result<(), windows::core::Error> { let profile_manager: ITfInputProcessorProfileMgr = create_instance_inproc(&CLSID_TF_InputProcessorProfiles)?; unsafe { profile_manager.UnregisterProfile( &SAMPLEIME_CLSID, - TEXTSERVICE_LANGID, + testservice_langid, &SAMPLEIME_GUID_PROFILE, 0, )?;