diff --git a/internal/lookup_compound.go b/internal/lookup_compound.go
index 20b95fb..b3a4e07 100644
--- a/internal/lookup_compound.go
+++ b/internal/lookup_compound.go
@@ -41,7 +41,7 @@ func (s *SymSpell) LookupCompound(phrase string, maxEditDistance int) *items.Sug
 		isLastCombi: false,
 	}
 	for i := range terms1 {
-		cp.terms1 = terms1[i]
+		cp.terms1 = s.replaceExactMatch(terms1[i])
 		s.getSuggestion(&cp, maxEditDistance)
 		// Combine adjacent terms
 		if i > 0 && !cp.isLastCombi {
@@ -214,6 +214,15 @@ func (s *SymSpell) finalizeAnswer(phrase string, suggestionParts []items.Suggest
 	}
 }
 
+// replaceExactMatch returns the ExactTransform replacement registered for
+// phrase, or phrase itself when the map has no entry for it.
+func (s *SymSpell) replaceExactMatch(phrase string) string {
+	if result, found := s.ExactTransform[phrase]; found {
+		return result
+	}
+	return phrase
+}
+
 func createWithProbability(term string, distance int) items.SuggestItem {
 	// Calculate Naive Bayes probability as the count
 	probabilityCount := int(10 / math.Pow(10, float64(len(term))))
diff --git a/internal/symspell.go b/internal/symspell.go
index 88cb10b..4f2f9d5 100644
--- a/internal/symspell.go
+++ b/internal/symspell.go
@@ -3,6 +3,7 @@ package internal
 import (
 	"bufio"
 	"errors"
+	"fmt"
 	"log"
 	"math"
 	"os"
@@ -27,6 +28,7 @@ type SymSpell struct {
 	Words               map[string]int
 	BelowThresholdWords map[string]int
 	Deletes             map[string][]string
+	ExactTransform      map[string]string
 	maxLength           int
 	distanceComparer    editdistance.IEditDistance
 	// lookup compound
@@ -66,6 +68,7 @@ func NewSymSpell(opt ...options.Options) (*SymSpell, error) {
 		Words:               make(map[string]int),
 		BelowThresholdWords: make(map[string]int),
 		Deletes:             make(map[string][]string),
+		ExactTransform:      make(map[string]string),
 		distanceComparer:    editdistance.NewEditDistance(editdistance.DamerauLevenshtein), // todo add more edit distance algorithms
 		maxLength:           0,
 		Bigrams:             make(map[string]int),
@@ -210,3 +213,64 @@ func incrementCount(count, countPrevious int) int {
 	}
 	return math.MaxInt64
 }
+
+// LoadExactDictionary opens the file at corpusPath and loads its
+// key/replacement pairs into s.ExactTransform through
+// LoadExactDictionaryStream. It returns an error when corpusPath is empty,
+// when the file cannot be opened, or when reading it fails.
+func (s *SymSpell) LoadExactDictionary(
+	corpusPath string,
+	separator string,
+) (bool, error) {
+	if corpusPath == "" {
+		return false, fmt.Errorf("corpus path cannot be empty")
+	}
+	file, err := os.Open(corpusPath)
+	if err != nil {
+		return false, err
+	}
+	defer file.Close()
+
+	// The stream loader reports failure only as a bool; attach an error so
+	// callers never see (false, nil).
+	if !s.LoadExactDictionaryStream(file, separator) {
+		return false, fmt.Errorf("reading exact dictionary %q", corpusPath)
+	}
+	return true, nil
+}
+
+// LoadExactDictionaryStream reads one "key<separator>replacement" pair per
+// line from corpusStream into s.ExactTransform. An empty separator splits on
+// runs of whitespace. Blank lines and lines missing a key or a replacement
+// are skipped, and fields after the second are ignored. It returns false when
+// the scanner stops on an error instead of end of input.
+func (s *SymSpell) LoadExactDictionaryStream(corpusStream *os.File, separator string) bool {
+	// Guard against a SymSpell built without NewSymSpell: writing to a nil map panics.
+	if s.ExactTransform == nil {
+		s.ExactTransform = make(map[string]string)
+	}
+	scanner := bufio.NewScanner(corpusStream)
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if line == "" {
+			continue
+		}
+		var parts []string
+		if separator == "" {
+			parts = strings.Fields(line)
+		} else {
+			parts = strings.Split(line, separator)
+		}
+		if len(parts) < 2 {
+			continue
+		}
+		key, exactMatch := parts[0], parts[1]
+		// Repeated separators produce empty fields; never map a term to "".
+		if key == "" || exactMatch == "" {
+			continue
+		}
+		s.ExactTransform[key] = exactMatch
+	}
+	// Scan also stops on read errors and over-long lines, not only at EOF.
+	return scanner.Err() == nil
+}
diff --git a/internal/tests/exact.txt b/internal/tests/exact.txt
new file mode 100644
index 0000000..ae9336a
--- /dev/null
+++ b/internal/tests/exact.txt
@@ -0,0 +1,8 @@
+خ خیابان
+م میدان
+کو کوی
+پ پلاک
+ک کوجه
+میدون میدان
+خیابون خیابان
+تهرون تهران
\ No newline at end of file
diff --git a/symspell.go b/symspell.go
index a8678bd..f078257 100644
--- a/symspell.go
+++ b/symspell.go
@@ -30,7 +30,7 @@ func NewSymSpellWithLoadDictionary(dirPath string, termIndex, countIndex int, op
 	return symspell
 }
 
-func NewSymSpellWithLoadBigramDictionary(vocabDirPath, bigramDirPath string, termIndex, countIndex int, opt ...options.Options) SymSpell {
+func NewSymSpellWithLoadBigramDictionary(vocabDirPath, bigramDirPath, exactDirPath string, termIndex, countIndex int, opt ...options.Options) SymSpell {
 	symspell := NewSymSpell(opt...)
 	ok, err := symspell.LoadDictionary(vocabDirPath, termIndex, countIndex, " ")
 	if err != nil || !ok {
@@ -42,6 +42,12 @@ func NewSymSpellWithLoadBigramDictionary(vocabDirPath, bigramDirPath string, ter
 			log.Println("[Error] ", err)
 		}
 	}
+	if exactDirPath != "" {
+		ok, err = symspell.LoadExactDictionary(exactDirPath, " ")
+		if err != nil || !ok {
+			log.Println("[Error] ", err)
+		}
+	}
 	return symspell
 }
 
@@ -50,4 +56,5 @@ type SymSpell interface {
 	LookupCompound(phrase string, maxEditDistance int) *items.SuggestItem
 	LoadBigramDictionary(corpusPath string, termIndex, countIndex int, separator string) (bool, error)
 	LoadDictionary(corpusPath string, termIndex int, countIndex int, separator string) (bool, error)
+	LoadExactDictionary(corpusPath string, separator string) (bool, error)
 }
diff --git a/symspell_test.go b/symspell_test.go
index 17c43c4..b0a90cc 100644
--- a/symspell_test.go
+++ b/symspell_test.go
@@ -110,7 +110,7 @@ func TestLookupCompound(t *testing.T) {
 			want: "secret plan",
 		},
 	}
-	symSpell := NewSymSpellWithLoadBigramDictionary("internal/tests/vocab.txt", "internal/tests/vocab_bigram.txt",
+	symSpell := NewSymSpellWithLoadBigramDictionary("internal/tests/vocab.txt", "internal/tests/vocab_bigram.txt", "",
 		0, 1,
 		options.WithCountThreshold(1),
 		options.WithMaxDictionaryEditDistance(3),
@@ -209,10 +209,35 @@ func TestSymspellLookupCompoundUnigram(t *testing.T) {
 			},
 			want: "خرداد 15",
 		},
+		{
+			name: "Exact Match 1",
+			args: args{
+				a:               "م ازادی",
+				maxEditDistance: 3,
+			},
+			want: "میدان ازادی",
+		},
+		{
+			name: "Exact Match 2",
+			args: args{
+				a:               "خ ازادی",
+				maxEditDistance: 3,
+			},
+			want: "خیابان ازادی",
+		},
+		{
+			name: "Exact Match 3",
+			args: args{
+				a:               "تهران خ ازادی",
+				maxEditDistance: 3,
+			},
+			want: "تهران خیابان ازادی",
+		},
 	}
 	symSpell := NewSymSpellWithLoadBigramDictionary(
 		"internal/tests/vocab_fa.txt",
 		"internal/tests/vocab_bigram_fa.txt",
+		"internal/tests/exact.txt",
 		0, 1,
 		options.WithCountThreshold(0),