Skip to content

Commit

Permalink
seed-pos: fix strands. index: better desert filling
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed May 13, 2024
1 parent ae79537 commit 6ebe6e2
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 40 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ docs/themes

demo/demo.lmi
demo/*.lexicmap.tsv.gz
demo/seed_distance.tsv
demo/seed_distance
demo/seed-pos.tsv.gz
demo/t.txt
Expand Down
40 changes: 20 additions & 20 deletions docs/content/usage/utils/seed-pos.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,21 +75,21 @@ Global Flags:
ref pos strand distance
--------------- --- ------ --------
GCF_000017205.1 2 + 1
GCF_000017205.1 41 + 39
GCF_000017205.1 41 - 39
GCF_000017205.1 45 + 4
GCF_000017205.1 74 + 29
GCF_000017205.1 85 + 11
GCF_000017205.1 119 + 34
GCF_000017205.1 130 + 11
GCF_000017205.1 74 - 29
GCF_000017205.1 85 - 11
GCF_000017205.1 119 - 34
GCF_000017205.1 130 - 11
GCF_000017205.1 185 + 55
GCF_000017205.1 269 + 84
GCF_000017205.1 269 - 84

Or only list records with seed distance longer than a threshold.

$ lexicmap utils seed-pos -d demo.lmi/ -n GCF_000017205.1 -D 1000 | csvtk pretty -t
ref pos strand distance
--------------- ------- ------ --------
GCF_000017205.1 5430137 + 1234
GCF_000017205.1 5430137 - 1235

Check the biggest seed distances.

Expand All @@ -100,7 +100,7 @@ Global Flags:

distance frequency
-------- ---------
1234 1
1235 1
899 2
898 4
897 1
Expand Down Expand Up @@ -136,7 +136,7 @@ Global Flags:
ref pos strand distance pre_pos len_aaa seq
--------------- --- ------ -------- ------- ------- ---------------------------------------
GCF_000017205.1 2 + 1 0 0 T
GCF_000017205.1 41 + 39 1 5 TAAAGAGACCGGCGATTCTAGTGAAATCGAACGGGCAGG
GCF_000017205.1 41 - 39 1 5 TAAAGAGACCGGCGATTCTAGTGAAATCGAACGGGCAGG
GCF_000017205.1 45 + 4 40 1 TCAA

Or only list records with seed distance longer than a threshold.
Expand All @@ -145,7 +145,7 @@ Global Flags:
| csvtk pretty -t -W 50 --clip
ref pos strand distance pre_pos len_aaa seq
--------------- ------- ------ -------- ------- ------- --------------------------------------------------
GCF_000017205.1 5430137 + 1234 5428902 4 ATCGGCGATCACGTTCAGCAGCGCCTTGGTGATGGTCAGGTTGTTGC.
GCF_000017205.1 5430137 - 1235 5428901 4 CATCGGCGATCACGTTCAGCAGCGCCTTGGTGATGGTCAGGTTGTTG...


3. Listing seed position of all genomes.
Expand All @@ -158,21 +158,21 @@ Global Flags:
$ csvtk freq -t -f ref -nr seed-pos.tsv.gz | csvtk pretty -t
ref frequency
--------------- ---------
GCF_000017205.1 45708
GCF_000017205.1 45711
GCF_002950215.1 43617
GCF_002949675.1 43411
GCF_002949675.1 43415
GCF_001457655.1 42112
GCF_006742205.1 42103
GCF_006742205.1 42102
GCF_900638025.1 42008
GCF_001027105.1 41856
GCF_000742135.1 41414
GCF_000392875.1 41392
GCF_001027105.1 41855
GCF_000742135.1 41415
GCF_000392875.1 41391
GCF_003697165.2 41189
GCF_009759685.1 41137
GCF_000006945.2 41108
GCF_009759685.1 41138
GCF_000006945.2 41110
GCF_000148585.2 41075
GCF_001096185.1 40239
GCF_001544255.1 40159
GCF_001096185.1 40226
GCF_001544255.1 40153

Plot the histograms of distances between seeds for all genomes.

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ require (
github.com/shenwei356/bio v0.13.3
github.com/shenwei356/go-logging v0.0.0-20171012171522-c6b9702d88ba
github.com/shenwei356/kmers v0.1.0
github.com/shenwei356/lexichash v0.3.0
github.com/shenwei356/lexichash v0.4.0
github.com/shenwei356/util v0.5.2
github.com/spf13/cobra v1.8.0
github.com/twotwotwo/sorts v0.0.0-20160814051341-bf5c1f2b8553
Expand Down
47 changes: 30 additions & 17 deletions lexicmap/cmd/lib-index-build.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
"github.com/shenwei356/LexicMap/lexicmap/cmd/genome"
"github.com/shenwei356/LexicMap/lexicmap/cmd/kv"
"github.com/shenwei356/LexicMap/lexicmap/cmd/seedposition"
"github.com/shenwei356/LexicMap/lexicmap/cmd/util"
"github.com/shenwei356/bio/seqio/fastx"
"github.com/shenwei356/lexichash"
"github.com/shenwei356/lexichash/iterator"
Expand Down Expand Up @@ -600,7 +601,6 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,

// --------------------------------
// 1) parsing input genome files & mask & pack sequences
k := lh.K
nnn := bytes.Repeat([]byte{'N'}, opt.ContigInterval)
reRefName := opt.ReRefName
extractRefName := reRefName != nil
Expand All @@ -620,6 +620,9 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
}()
startTime := time.Now()

k := lh.K
k8 := uint8(lh.K)

// --------------------------------
// read sequence

Expand Down Expand Up @@ -821,6 +824,9 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
var hash, hashMin uint64
var knl *[]uint64

var lenPrefixSuffix uint8 = 1
ttt := uint64((1 << (lenPrefixSuffix << 1)) - 1)

extraLocs := poolInts.Get().(*[]int) // extra positions
*extraLocs = (*extraLocs)[:0]

Expand All @@ -835,6 +841,12 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
continue
}

// there's a really big gap in it, it might be the interver between contigs or a assembly gap
if float64(lengthAAs(refseq.Seq[pre:pos]))/float64(d) >= 0.7 {
pre = pos
continue
}

// range of desert region +- 1000 bp,
// as we don't want other kmers with the same prefix exist around.
start = int(pre) - 1000 // start position in the sequence
Expand Down Expand Up @@ -896,11 +908,6 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
*p2ks = append(*p2ks, p2k)
}

if (*counter)[0] > 50 { // it's in an interval region
pre = pos
continue
}

// fmt.Printf(" list size: %d\n", len(*p2ks))

// fmt.Printf(" posOfPre: %d, posOfCur: %d\n", posOfPre, posOfCur)
Expand All @@ -920,20 +927,22 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,

ok = false
for ; _j > _end; _j-- {
// fmt.Printf(" test %d\n", _j)
// fmt.Printf(" test u %d\n", _j)
p2k = (*p2ks)[_j]

// strand +
p, kmer = (*p2k)[0], (*p2k)[1]
if kmer != 0 && (*counter)[p] == 1 {
if kmer != 0 && (*counter)[p] == 1 &&
!util.MustKmerHasPrefix(kmer, 0, k8, lenPrefixSuffix) { // (gap AAAAAAAAAAAA) ACTGACTAG
kmerPos = uint64(start+_j) << 1
ok = true
break
}

// strand -
p, kmer = (*p2k)[2], (*p2k)[3]
if kmer != 0 && (*counter)[p] == 1 {
if kmer != 0 && (*counter)[p] == 1 &&
!util.MustKmerHasSuffix(kmer, ttt, k8, lenPrefixSuffix) { // (gap AAAAAAAAAAAA) ACTGACTAG
kmerPos = uint64(start+_j)<<1 | 1
ok = true
break
Expand Down Expand Up @@ -978,19 +987,21 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
_end = posOfCur - 1
}
for _j = _start; _j < _end; _j++ {
// fmt.Printf(" test d %d\n", _j)
p2k = (*p2ks)[_j]

// strand +
p, kmer = (*p2k)[0], (*p2k)[1]
if kmer != 0 && (*counter)[p] == 1 {
if kmer != 0 && (*counter)[p] == 1 &&
!util.MustKmerHasPrefix(kmer, 0, k8, lenPrefixSuffix) { // (gap AAAAAAAAAAAA) ACTGACTAG
kmerPos = uint64(start+_j) << 1
ok = true
break
}

// strand -
p, kmer = (*p2k)[2], (*p2k)[3]
if kmer != 0 && (*counter)[p] == 1 {
if kmer != 0 && (*counter)[p] == 1 &&
!util.MustKmerHasSuffix(kmer, ttt, k8, lenPrefixSuffix) { // (gap AAAAAAAAAAAA) ACTGACTAG
kmerPos = uint64(start+_j)<<1 | 1
ok = true
break
Expand All @@ -1001,10 +1012,12 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
maskList = lh.MaskKmer(kmer)

hashMin = math.MaxUint64
for _, _im = range *maskList { // find the mask which capture the most similar k-mer with current one
for _, _im = range *maskList {
// fmt.Printf(" to %s\n", kmers.MustDecode((*_kmers)[_im], k))
hash = kmer ^ (*_kmers)[_im]
// hash = kmer ^ lh.Masks[_im] // both are OK

// both are OK
hash = kmer ^ (*_kmers)[_im] // find the mask which capture the most similar k-mer with current one
// hash = kmer ^ lh.Masks[_im] // find the mask which will capture current one
if hash < hashMin {
_imMostSimilar = _im
}
Expand Down Expand Up @@ -1153,7 +1166,7 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
nMasks := opt.Masks
chunkSize := (nMasks + chunks - 1) / opt.Chunks
var j, begin, end int

k8 := uint8(lh.K)
tokens = make(chan int, 1) // hope it reduces memory
for j = 0; j < chunks; j++ { // each chunk
begin = j * chunkSize
Expand All @@ -1176,7 +1189,7 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
// }
// }

_, err := kv.WriteKVData(uint8(k), begin, (*datas)[begin:end], file, opt.Partitions)
_, err := kv.WriteKVData(k8, begin, (*datas)[begin:end], file, opt.Partitions)
if err != nil {
checkError(fmt.Errorf("failed to write seeds data: %s", err))
}
Expand Down
4 changes: 2 additions & 2 deletions lexicmap/cmd/seed-pos.go
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ Extra columns:
continue
}

fmt.Fprintf(outfh, "%s\t%d\t%c\t%d", refname, pos+1, lexichash.Strands[pos2str&1], dist)
fmt.Fprintf(outfh, "%s\t%d\t%c\t%d", refname, pos+1, lexichash.Strands[pos2str>>1&1], dist)

if moreColumns {
if dist <= 0 {
Expand Down Expand Up @@ -411,7 +411,7 @@ Extra columns:

v = append(v, float64(dist))

fmt.Fprintf(outfh, "%s\t%d\t%c\t%d", refname, pos+1, lexichash.Strands[pos2str&1], dist)
fmt.Fprintf(outfh, "%s\t%d\t%c\t%d", refname, pos+1, lexichash.Strands[pos2str>>1&1], dist)

if moreColumns {
if dist <= 0 {
Expand Down
13 changes: 13 additions & 0 deletions lexicmap/cmd/util/kmers.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,19 @@ func MustKmerHasPrefix(code uint64, prefix uint64, k1, k2 uint8) bool {
return code>>((k1-k2)<<1) == prefix
}

// KmerHasSuffix checks if a k-mer has a suffix.
func KmerHasSuffix(code uint64, suffix uint64, k1, k2 uint8) bool {
if k1 < k2 {
return false
}
return code&((1<<(k2<<1))-1) == suffix
}

// MustKmerHasSuffix checks if a k-mer has a suffix, by assuming k1>=k2.
func MustKmerHasSuffix(code uint64, suffix uint64, k1, k2 uint8) bool {
return code&((1<<(k2<<1))-1) == suffix
}

// SharingPrefixKmersMismatch counts the number of mismatch between two k-mers
// sharing with a p-bp prefix.
func SharingPrefixKmersMismatch(code1, code2 uint64, k, p uint8) (n uint8) {
Expand Down

0 comments on commit 6ebe6e2

Please sign in to comment.