Skip to content

Commit

Permalink
Merge pull request #15 from snapp-incubator/feat/exact-transform
Browse files Browse the repository at this point in the history
exact transform words
  • Loading branch information
sepehrsoh authored Feb 8, 2025
2 parents cc37be7 + 20851a5 commit 31ab7b5
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 3 deletions.
9 changes: 8 additions & 1 deletion internal/lookup_compound.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func (s *SymSpell) LookupCompound(phrase string, maxEditDistance int) *items.Sug
isLastCombi: false,
}
for i := range terms1 {
cp.terms1 = terms1[i]
cp.terms1 = s.replaceExactMatch(terms1[i])
s.getSuggestion(&cp, maxEditDistance)
// Combine adjacent terms
if i > 0 && !cp.isLastCombi {
Expand Down Expand Up @@ -214,6 +214,13 @@ func (s *SymSpell) finalizeAnswer(phrase string, suggestionParts []items.Suggest
}
}

// replaceExactMatch looks phrase up in s.ExactTransform and returns the
// mapped replacement when an entry exists. When there is no entry, phrase is
// returned unchanged.
func (s *SymSpell) replaceExactMatch(phrase string) string {
	replacement, ok := s.ExactTransform[phrase]
	if !ok {
		return phrase
	}
	return replacement
}

func createWithProbability(term string, distance int) items.SuggestItem {
// Calculate Naive Bayes probability as the count
probabilityCount := int(10 / math.Pow(10, float64(len(term))))
Expand Down
49 changes: 49 additions & 0 deletions internal/symspell.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package internal
import (
"bufio"
"errors"
"fmt"
"log"
"math"
"os"
Expand All @@ -27,6 +28,7 @@ type SymSpell struct {
Words map[string]int
BelowThresholdWords map[string]int
Deletes map[string][]string
ExactTransform map[string]string
maxLength int
distanceComparer editdistance.IEditDistance
// lookup compound
Expand Down Expand Up @@ -66,6 +68,7 @@ func NewSymSpell(opt ...options.Options) (*SymSpell, error) {
Words: make(map[string]int),
BelowThresholdWords: make(map[string]int),
Deletes: make(map[string][]string),
ExactTransform: make(map[string]string),
distanceComparer: editdistance.NewEditDistance(editdistance.DamerauLevenshtein), // todo add more edit distance algorithms
maxLength: 0,
Bigrams: make(map[string]int),
Expand Down Expand Up @@ -210,3 +213,49 @@ func incrementCount(count, countPrevious int) int {
}
return math.MaxInt64
}

// LoadExactDictionary opens the file at corpusPath and passes it, together
// with separator, to LoadExactDictionaryStream.
//
// It returns (false, err) when corpusPath is empty or the file cannot be
// opened. Otherwise it returns the stream loader's result with a nil error.
// The file is closed before the function returns.
func (s *SymSpell) LoadExactDictionary(
	corpusPath string,
	separator string,
) (bool, error) {
	if len(corpusPath) == 0 {
		return false, fmt.Errorf("corpus path cannot be empty")
	}

	f, openErr := os.Open(corpusPath)
	if openErr != nil {
		return false, openErr
	}
	defer f.Close()

	// Parsing of the opened file is delegated to the stream-based loader.
	loaded := s.LoadExactDictionaryStream(f, separator)
	return loaded, nil
}

// LoadExactDictionaryStream reads exact-transform entries from corpusStream,
// one per line, and stores them in s.ExactTransform.
//
// Each non-blank line is split into fields: on runs of whitespace when
// separator is empty, otherwise on every occurrence of separator. The first
// field is the term to match and the second is its replacement; any further
// fields are ignored. Lines with fewer than two fields, or whose term or
// replacement is empty after trimming surrounding whitespace, are skipped.
// A later entry for the same term overwrites an earlier one.
//
// It returns true when the whole stream was scanned, and false when the
// scanner stopped on an error (for example a read failure or a line longer
// than bufio.MaxScanTokenSize). Entries read before the error stay loaded.
func (s *SymSpell) LoadExactDictionaryStream(corpusStream *os.File, separator string) bool {
	// Writing to a nil map panics, so make the zero-value SymSpell usable.
	if s.ExactTransform == nil {
		s.ExactTransform = make(map[string]string)
	}

	scanner := bufio.NewScanner(corpusStream)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}

		var parts []string
		if separator == "" {
			parts = strings.Fields(line)
		} else {
			parts = strings.Split(line, separator)
		}
		if len(parts) < 2 {
			continue
		}

		key := strings.TrimSpace(parts[0])
		exactMatch := strings.TrimSpace(parts[1])
		// A repeated separator yields empty fields; never register an empty
		// term or map a term to the empty string.
		if key == "" || exactMatch == "" {
			continue
		}
		s.ExactTransform[key] = exactMatch
	}

	// Scan returns false both at EOF and on failure; only Err tells them apart.
	return scanner.Err() == nil
}
8 changes: 8 additions & 0 deletions internal/tests/exact.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
خ خیابان
م میدان
کو کوی
پ پلاک
ک کوچه
میدون میدان
خیابون خیابان
تهرون تهران
9 changes: 8 additions & 1 deletion symspell.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func NewSymSpellWithLoadDictionary(dirPath string, termIndex, countIndex int, op
return symspell
}

func NewSymSpellWithLoadBigramDictionary(vocabDirPath, bigramDirPath string, termIndex, countIndex int, opt ...options.Options) SymSpell {
func NewSymSpellWithLoadBigramDictionary(vocabDirPath, bigramDirPath, exactDirPath string, termIndex, countIndex int, opt ...options.Options) SymSpell {
symspell := NewSymSpell(opt...)
ok, err := symspell.LoadDictionary(vocabDirPath, termIndex, countIndex, " ")
if err != nil || !ok {
Expand All @@ -42,6 +42,12 @@ func NewSymSpellWithLoadBigramDictionary(vocabDirPath, bigramDirPath string, ter
log.Println("[Error] ", err)
}
}
if exactDirPath != "" {
ok, err = symspell.LoadExactDictionary(exactDirPath, " ")
if err != nil || !ok {
log.Println("[Error] ", err)
}
}
return symspell
}

Expand All @@ -50,4 +56,5 @@ type SymSpell interface {
LookupCompound(phrase string, maxEditDistance int) *items.SuggestItem
LoadBigramDictionary(corpusPath string, termIndex, countIndex int, separator string) (bool, error)
LoadDictionary(corpusPath string, termIndex int, countIndex int, separator string) (bool, error)
LoadExactDictionary(corpusPath string, separator string) (bool, error)
}
27 changes: 26 additions & 1 deletion symspell_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ func TestLookupCompound(t *testing.T) {
want: "secret plan",
},
}
symSpell := NewSymSpellWithLoadBigramDictionary("internal/tests/vocab.txt", "internal/tests/vocab_bigram.txt",
symSpell := NewSymSpellWithLoadBigramDictionary("internal/tests/vocab.txt", "internal/tests/vocab_bigram.txt", "",
0, 1,
options.WithCountThreshold(1),
options.WithMaxDictionaryEditDistance(3),
Expand Down Expand Up @@ -209,10 +209,35 @@ func TestSymspellLookupCompoundUnigram(t *testing.T) {
},
want: "خرداد 15",
},
{
name: "Exact Match 1",
args: args{
a: "م ازادی",
maxEditDistance: 3,
},
want: "میدان ازادی",
},
{
name: "Exact Match 2",
args: args{
a: "خ ازادی",
maxEditDistance: 3,
},
want: "خیابان ازادی",
},
{
name: "Exact Match 3",
args: args{
a: "تهران خ ازادی",
maxEditDistance: 3,
},
want: "تهران خیابان ازادی",
},
}
symSpell := NewSymSpellWithLoadBigramDictionary(
"internal/tests/vocab_fa.txt",
"internal/tests/vocab_bigram_fa.txt",
"internal/tests/exact.txt",
0,
1,
options.WithCountThreshold(0),
Expand Down

0 comments on commit 31ab7b5

Please sign in to comment.