Skip to content

Commit

Permalink
search: change the seed-chaining function
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed May 14, 2024
1 parent 6ebe6e2 commit 4d95e6d
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 6 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@


*.directory
.Rhistory

*ssshtest
*.nextflow.log*
*.brename_detail.txt
Expand Down
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
- Change the format of k-mer-value index file, and fix the computation of index partitions.
- Optionally save seed positions which can be outputted by `lexicmap utils seed-pos`.
- `lexicmap search`:
- **Improved seed-chaining algorithm**.
- **Better support of long queries**.
- **Add a new flag `-w/--load-whole-seeds` for loading the whole seed data into memory for faster search**.
- **Parallelize alignment in each query**, so it's faster for a single query.
- **Optional outputing matched query and subject sequences**.
- 2-5X searching speed with a faster masking method.
- Change output format.
- Add output of query start and end positions.
- Fix a seed-chaining bug.
- Fix a target sequence extracting bug.
- Keep indexes of genome data in memory.
- `lexicmap utils kmers`:
Expand Down
2 changes: 1 addition & 1 deletion docs/content/releases/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@ weight: 30
- Change the format of k-mer-value index file, and fix the computation of index partitions.
- Optionally save seed positions which can be outputted by `lexicmap utils seed-pos`.
- `lexicmap search`:
- **Improved seed-chaining algorithm**.
- **Better support of long queries**.
- **Add a new flag `-w/--load-whole-seeds` for loading the whole seed data into memory for faster search**.
- **Parallelize alignment in each query**, so it's faster for a single query.
- **Optional outputing matched query and subject sequences**.
- 2-5X searching speed with a faster masking method.
- Change output format.
- Add output of query start and end positions.
- Fix a seed-chaining bug.
- Fix a target sequence extracting bug.
- Keep indexes of genome data in memory.
- `lexicmap utils kmers`:
Expand Down
1 change: 1 addition & 0 deletions docs/static/main-a559ecaa.min.css

Large diffs are not rendered by default.

36 changes: 32 additions & 4 deletions lexicmap/cmd/lib-chaining.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ type Chainer struct {
// scores []float64 // actually, it's not necessary
maxscores []float64
maxscoresIdxs []int
directions []float64
visited []bool
}

Expand All @@ -57,6 +58,7 @@ func NewChainer(options *ChainingOptions) *Chainer {
// scores: make([]float64, 0, 128),
maxscores: make([]float64, 0, 10240),
maxscoresIdxs: make([]int, 0, 10240),
directions: make([]float64, 0, 10240),
visited: make([]bool, 0, 10240),
}
return c
Expand Down Expand Up @@ -119,6 +121,8 @@ func (ce *Chainer) Chain(subs *[]*SubstrPair) (*[]*[]int, float64) {
// index of previous seed, the size is n
maxscoresIdxs := &ce.maxscoresIdxs
*maxscoresIdxs = (*maxscoresIdxs)[:0]
directions := &ce.directions
*directions = (*directions)[:0]
// for i = 0; i < n; i++ {
// maxscores = append(maxscores, 0)
// maxscoresIdxs = append(maxscoresIdxs, 0)
Expand All @@ -131,22 +135,30 @@ func (ce *Chainer) Chain(subs *[]*SubstrPair) (*[]*[]int, float64) {
// }
*maxscores = append(*maxscores, seedWeight(float64((*subs)[0].Len)))
*maxscoresIdxs = append(*maxscoresIdxs, 0)
*directions = append(*directions, 1)

// compute scores
var s, m, d, g float64
var dir, mdir float64
var a, b *SubstrPair
maxGap := ce.options.MaxGap
maxDistance := ce.options.MaxDistance
var _first bool
for i = 1; i < n; i++ {
a = (*subs)[i]

// just initialize the max score, which comes from the current seed
// m = scores[k]
m, mj = seedWeight(float64(a.Len)), i
m, mj, mdir = seedWeight(float64(a.Len)), i, 1

_first = true
for j = 0; j < i; j++ { // try all previous seeds, no bound
b = (*subs)[j]

if a.QBegin == b.QBegin || a.TBegin == b.TBegin {
continue
}

d = distance(a, b)
if d > maxDistance {
continue
Expand All @@ -157,21 +169,30 @@ func (ce *Chainer) Chain(subs *[]*SubstrPair) (*[]*[]int, float64) {
continue
}

s = (*maxscores)[j] + seedWeight(float64(b.Len)) - distanceScore(d) - gapScore(g)
// scores[k] = s
// s = (*maxscores)[j] + seedWeight(float64(b.Len)) - distanceScore(d) - gapScore(g)

dir = direction(a, b)
if _first {
s = (*maxscores)[j] + seedWeight(float64(b.Len)) - gapScore(g)
_first = false
} else {
s = (*maxscores)[j] + seedWeight(float64(b.Len))*(*directions)[j]*dir - gapScore(g)
}

if s >= m { // update the max score
m = s
mj = j
mdir = dir
}
}
*maxscores = append(*maxscores, m) // save the max score
*maxscoresIdxs = append(*maxscoresIdxs, mj)
*directions = append(*directions, mdir)
}
// print the score matrix
// fmt.Printf("i\tpair-i\tiMax\tj:scores\n")
// for i = 0; i < n; i++ {
// fmt.Printf("%d\t%s\t%d:%.3f\n", i, (*subs)[i], (*maxscoresIdxs)[i], (*maxscores)[i])
// fmt.Printf("%d\t%s\t%d:%.0f:%.3f\n", i, (*subs)[i], (*maxscoresIdxs)[i], (*directions)[i], (*maxscores)[i])
// }

// backtrack
Expand Down Expand Up @@ -265,6 +286,13 @@ func distanceScore(d float64) float64 {
return 0.01 * d
}

func direction(a, b *SubstrPair) float64 {
if a.TBegin >= b.TBegin {
return 1
}
return -1
}

func gap(a, b *SubstrPair) float64 {
return math.Abs(math.Abs(float64(a.QBegin-b.QBegin)) - math.Abs(float64(a.TBegin-b.TBegin)))
}
Expand Down

0 comments on commit 4d95e6d

Please sign in to comment.