diff --git a/.gitignore b/.gitignore
index cfd99cf..8da8b1b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@
 docs/themes
 demo/demo.lmi
 demo/*.lexicmap.tsv.gz
+demo/seed_distance.tsv
 demo/seed_distance
 demo/seed-pos.tsv.gz
 demo/t.txt
diff --git a/docs/content/usage/utils/seed-pos.md b/docs/content/usage/utils/seed-pos.md
index ddf9456..65db302 100644
--- a/docs/content/usage/utils/seed-pos.md
+++ b/docs/content/usage/utils/seed-pos.md
@@ -75,21 +75,21 @@ Global Flags:
         ref               pos   strand   distance
         ---------------   ---   ------   --------
         GCF_000017205.1   2     +        1
-        GCF_000017205.1   41    +        39
+        GCF_000017205.1   41    -        39
         GCF_000017205.1   45    +        4
-        GCF_000017205.1   74    +        29
-        GCF_000017205.1   85    +        11
-        GCF_000017205.1   119   +        34
-        GCF_000017205.1   130   +        11
+        GCF_000017205.1   74    -        29
+        GCF_000017205.1   85    -        11
+        GCF_000017205.1   119   -        34
+        GCF_000017205.1   130   -        11
         GCF_000017205.1   185   +        55
-        GCF_000017205.1   269   +        84
+        GCF_000017205.1   269   -        84
 
     Or only list records with seed distance longer than a threshold.
 
         $ lexicmap utils seed-pos -d demo.lmi/ -n GCF_000017205.1 -D 1000 | csvtk pretty -t
         ref               pos       strand   distance
         ---------------   -------   ------   --------
-        GCF_000017205.1   5430137   +        1234
+        GCF_000017205.1   5430137   -        1235
 
     Check the biggest seed distances.
 
@@ -100,7 +100,7 @@ Global Flags:
 
         distance   frequency
         --------   ---------
-        1234       1
+        1235       1
         899        2
         898        4
         897        1
@@ -136,7 +136,7 @@ Global Flags:
         ref               pos   strand   distance   pre_pos   len_aaa   seq
         ---------------   ---   ------   --------   -------   -------   ---------------------------------------
         GCF_000017205.1   2     +        1          0         0         T
-        GCF_000017205.1   41    +        39         1         5         TAAAGAGACCGGCGATTCTAGTGAAATCGAACGGGCAGG
+        GCF_000017205.1   41    -        39         1         5         TAAAGAGACCGGCGATTCTAGTGAAATCGAACGGGCAGG
         GCF_000017205.1   45    +        4          40        1         TCAA
 
     Or only list records with seed distance longer than a threshold.
@@ -145,7 +145,7 @@ Global Flags:
              | csvtk pretty -t -W 50 --clip
         ref               pos       strand   distance   pre_pos   len_aaa   seq
         ---------------   -------   ------   --------   -------   -------   --------------------------------------------------
-        GCF_000017205.1   5430137   +        1234       5428902   4         ATCGGCGATCACGTTCAGCAGCGCCTTGGTGATGGTCAGGTTGTTGC.
+        GCF_000017205.1   5430137   -        1235       5428901   4         CATCGGCGATCACGTTCAGCAGCGCCTTGGTGATGGTCAGGTTGTTG...
 
 3. Listing seed position of all genomes.
 
@@ -158,21 +158,21 @@ Global Flags:
         $ csvtk freq -t -f ref -nr seed-pos.tsv.gz | csvtk pretty -t
         ref               frequency
         ---------------   ---------
-        GCF_000017205.1   45708
+        GCF_000017205.1   45711
         GCF_002950215.1   43617
-        GCF_002949675.1   43411
+        GCF_002949675.1   43415
         GCF_001457655.1   42112
-        GCF_006742205.1   42103
+        GCF_006742205.1   42102
         GCF_900638025.1   42008
-        GCF_001027105.1   41856
-        GCF_000742135.1   41414
-        GCF_000392875.1   41392
+        GCF_001027105.1   41855
+        GCF_000742135.1   41415
+        GCF_000392875.1   41391
         GCF_003697165.2   41189
-        GCF_009759685.1   41137
-        GCF_000006945.2   41108
+        GCF_009759685.1   41138
+        GCF_000006945.2   41110
         GCF_000148585.2   41075
-        GCF_001096185.1   40239
-        GCF_001544255.1   40159
+        GCF_001096185.1   40226
+        GCF_001544255.1   40153
 
     Plot the histograms of distances between seeds for all genomes.
diff --git a/go.mod b/go.mod
index 718653b..193a5cc 100644
--- a/go.mod
+++ b/go.mod
@@ -15,7 +15,7 @@ require (
 	github.com/shenwei356/bio v0.13.3
 	github.com/shenwei356/go-logging v0.0.0-20171012171522-c6b9702d88ba
 	github.com/shenwei356/kmers v0.1.0
-	github.com/shenwei356/lexichash v0.3.0
+	github.com/shenwei356/lexichash v0.4.0
 	github.com/shenwei356/util v0.5.2
 	github.com/spf13/cobra v1.8.0
 	github.com/twotwotwo/sorts v0.0.0-20160814051341-bf5c1f2b8553
diff --git a/lexicmap/cmd/lib-index-build.go b/lexicmap/cmd/lib-index-build.go
index dfb38e0..9107b5a 100644
--- a/lexicmap/cmd/lib-index-build.go
+++ b/lexicmap/cmd/lib-index-build.go
@@ -37,6 +37,7 @@ import (
 	"github.com/shenwei356/LexicMap/lexicmap/cmd/genome"
 	"github.com/shenwei356/LexicMap/lexicmap/cmd/kv"
 	"github.com/shenwei356/LexicMap/lexicmap/cmd/seedposition"
+	"github.com/shenwei356/LexicMap/lexicmap/cmd/util"
 	"github.com/shenwei356/bio/seqio/fastx"
 	"github.com/shenwei356/lexichash"
 	"github.com/shenwei356/lexichash/iterator"
@@ -600,7 +601,6 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
 	// --------------------------------
 	// 1) parsing input genome files & mask & pack sequences
 
-	k := lh.K
 	nnn := bytes.Repeat([]byte{'N'}, opt.ContigInterval)
 	reRefName := opt.ReRefName
 	extractRefName := reRefName != nil
@@ -620,6 +620,9 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
 	}()
 	startTime := time.Now()
 
+	k := lh.K
+	k8 := uint8(lh.K)
+
 	// --------------------------------
 	// read sequence
 
@@ -821,6 +824,9 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
 			var hash, hashMin uint64
 			var knl *[]uint64
 
+			var lenPrefixSuffix uint8 = 1
+			ttt := uint64((1 << (lenPrefixSuffix << 1)) - 1)
+
 			extraLocs := poolInts.Get().(*[]int) // extra positions
 			*extraLocs = (*extraLocs)[:0]
 
@@ -835,6 +841,12 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
 					continue
 				}
 
+				// there's a really big gap in it; it might be the interval between contigs or an assembly gap
+				if float64(lengthAAs(refseq.Seq[pre:pos]))/float64(d) >= 0.7 {
+					pre = pos
+					continue
+				}
+
 				// range of desert region +- 1000 bp,
 				// as we don't want other kmers with the same prefix exist around.
 				start = int(pre) - 1000 // start position in the sequence
@@ -896,11 +908,6 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
 					*p2ks = append(*p2ks, p2k)
 				}
 
-				if (*counter)[0] > 50 { // it's in an interval region
-					pre = pos
-					continue
-				}
-
 				// fmt.Printf(" list size： %d\n", len(*p2ks))
 				// fmt.Printf(" posOfPre: %d, posOfCur: %d\n", posOfPre, posOfCur)
 
@@ -920,12 +927,13 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
 
 				ok = false
 				for ; _j > _end; _j-- {
-					// fmt.Printf(" test %d\n", _j)
+					// fmt.Printf(" test u %d\n", _j)
 					p2k = (*p2ks)[_j]
 
 					// strand +
 					p, kmer = (*p2k)[0], (*p2k)[1]
-					if kmer != 0 && (*counter)[p] == 1 {
+					if kmer != 0 && (*counter)[p] == 1 &&
+						!util.MustKmerHasPrefix(kmer, 0, k8, lenPrefixSuffix) { // (gap AAAAAAAAAAAA) ACTGACTAG
 						kmerPos = uint64(start+_j) << 1
 						ok = true
 						break
@@ -933,7 +941,8 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
 
 					// strand -
 					p, kmer = (*p2k)[2], (*p2k)[3]
-					if kmer != 0 && (*counter)[p] == 1 {
+					if kmer != 0 && (*counter)[p] == 1 &&
+						!util.MustKmerHasSuffix(kmer, ttt, k8, lenPrefixSuffix) { // (gap AAAAAAAAAAAA) ACTGACTAG
 						kmerPos = uint64(start+_j)<<1 | 1
 						ok = true
 						break
@@ -978,11 +987,12 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
 					_end = posOfCur - 1
 				}
 				for _j = _start; _j < _end; _j++ {
+					// fmt.Printf(" test d %d\n", _j)
 					p2k = (*p2ks)[_j]
-
 					// strand +
 					p, kmer = (*p2k)[0], (*p2k)[1]
-					if kmer != 0 && (*counter)[p] == 1 {
+					if kmer != 0 && (*counter)[p] == 1 &&
+						!util.MustKmerHasPrefix(kmer, 0, k8, lenPrefixSuffix) { // (gap AAAAAAAAAAAA) ACTGACTAG
 						kmerPos = uint64(start+_j) << 1
 						ok = true
 						break
@@ -990,7 +1000,8 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
 
 					// strand -
 					p, kmer = (*p2k)[2], (*p2k)[3]
-					if kmer != 0 && (*counter)[p] == 1 {
+					if kmer != 0 && (*counter)[p] == 1 &&
+						!util.MustKmerHasSuffix(kmer, ttt, k8, lenPrefixSuffix) { // (gap AAAAAAAAAAAA) ACTGACTAG
 						kmerPos = uint64(start+_j)<<1 | 1
 						ok = true
 						break
@@ -1001,10 +1012,12 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
 
 				maskList = lh.MaskKmer(kmer)
 				hashMin = math.MaxUint64
-				for _, _im = range *maskList { // find the mask which capture the most similar k-mer with current one
+				for _, _im = range *maskList {
 					// fmt.Printf(" to %s\n", kmers.MustDecode((*_kmers)[_im], k))
-					hash = kmer ^ (*_kmers)[_im]
-					// hash = kmer ^ lh.Masks[_im] // both are OK
+
+					// both are OK
+					hash = kmer ^ (*_kmers)[_im] // find the mask which captures the most similar k-mer to the current one
+					// hash = kmer ^ lh.Masks[_im] // find the mask which will capture the current one
 					if hash < hashMin {
 						_imMostSimilar = _im
 					}
@@ -1153,7 +1166,7 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
 	nMasks := opt.Masks
 	chunkSize := (nMasks + chunks - 1) / opt.Chunks
 	var j, begin, end int
-
+	k8 := uint8(lh.K)
 	tokens = make(chan int, 1) // hope it reduces memory
 	for j = 0; j < chunks; j++ { // each chunk
 		begin = j * chunkSize
@@ -1176,7 +1189,7 @@ func buildAnIndex(lh *lexichash.LexicHash, opt *IndexBuildingOptions,
 		// }
 		// }
 
-		_, err := kv.WriteKVData(uint8(k), begin, (*datas)[begin:end], file, opt.Partitions)
+		_, err := kv.WriteKVData(k8, begin, (*datas)[begin:end], file, opt.Partitions)
 		if err != nil {
 			checkError(fmt.Errorf("failed to write seeds data: %s", err))
 		}
diff --git a/lexicmap/cmd/seed-pos.go b/lexicmap/cmd/seed-pos.go
index 4486271..f0896b3 100644
--- a/lexicmap/cmd/seed-pos.go
+++ b/lexicmap/cmd/seed-pos.go
@@ -348,7 +348,7 @@ Extra columns:
 			continue
 		}
 
-		fmt.Fprintf(outfh, "%s\t%d\t%c\t%d", refname, pos+1, lexichash.Strands[pos2str&1], dist)
+		fmt.Fprintf(outfh, "%s\t%d\t%c\t%d", refname, pos+1, lexichash.Strands[pos2str>>1&1], dist)
 
 		if moreColumns {
 			if dist <= 0 {
@@ -411,7 +411,7 @@ Extra columns:
 
 		v = append(v, float64(dist))
 
-		fmt.Fprintf(outfh, "%s\t%d\t%c\t%d", refname, pos+1, lexichash.Strands[pos2str&1], dist)
+		fmt.Fprintf(outfh, "%s\t%d\t%c\t%d", refname, pos+1, lexichash.Strands[pos2str>>1&1], dist)
 
 		if moreColumns {
 			if dist <= 0 {
diff --git a/lexicmap/cmd/util/kmers.go b/lexicmap/cmd/util/kmers.go
index cc28885..426b4cd 100644
--- a/lexicmap/cmd/util/kmers.go
+++ b/lexicmap/cmd/util/kmers.go
@@ -72,6 +72,19 @@ func MustKmerHasPrefix(code uint64, prefix uint64, k1, k2 uint8) bool {
 	return code>>((k1-k2)<<1) == prefix
 }
 
+// KmerHasSuffix checks if a k-mer has a suffix.
+func KmerHasSuffix(code uint64, suffix uint64, k1, k2 uint8) bool {
+	if k1 < k2 {
+		return false
+	}
+	return code&((1<<(k2<<1))-1) == suffix
+}
+
+// MustKmerHasSuffix checks if a k-mer has a suffix, by assuming k1>=k2.
+func MustKmerHasSuffix(code uint64, suffix uint64, k1, k2 uint8) bool {
+	return code&((1<<(k2<<1))-1) == suffix
+}
+
 // SharingPrefixKmersMismatch counts the number of mismatch between two k-mers
 // sharing with a p-bp prefix.
 func SharingPrefixKmersMismatch(code1, code2 uint64, k, p uint8) (n uint8) {
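
Note on the new prefix/suffix checks: in the desert-filling loops above, a plus-strand candidate k-mer is now rejected when its first base is A (prefix 0), and a minus-strand candidate when its last base is T (ttt), which appears intended to keep seeds off the poly-A/poly-T runs flagged by the "(gap AAAAAAAAAAAA)" comments. The sketch below only illustrates that bit arithmetic: it copies the two helper bodies from this PR, while the encode helper, the A=0/C=1/G=2/T=3 encoding assumption, and the example k-mers are made up for demonstration and are not part of the codebase.

```go
package main

import "fmt"

// Copied from lexicmap/cmd/util/kmers.go in this PR: test the first/last
// k2 bases of a k1-bp k-mer stored in the lower 2*k1 bits of a uint64.
func MustKmerHasPrefix(code uint64, prefix uint64, k1, k2 uint8) bool {
	return code>>((k1-k2)<<1) == prefix
}

func MustKmerHasSuffix(code uint64, suffix uint64, k1, k2 uint8) bool {
	return code&((1<<(k2<<1))-1) == suffix
}

// encode is a hypothetical 2-bit encoder (A=0, C=1, G=2, T=3) used only for
// this demonstration; the real index uses the lexichash/kmers packages.
func encode(s string) (code uint64) {
	m := map[byte]uint64{'A': 0, 'C': 1, 'G': 2, 'T': 3}
	for i := 0; i < len(s); i++ {
		code = code<<2 | m[s[i]]
	}
	return code
}

func main() {
	const k uint8 = 9
	var lenPrefixSuffix uint8 = 1
	// ttt is the all-T suffix of length lenPrefixSuffix, built as in lib-index-build.go.
	ttt := uint64((1 << (lenPrefixSuffix << 1)) - 1)

	// A k-mer right after a poly-A stretch starts with A; one right before a
	// poly-T stretch (a poly-A gap seen on the minus strand) ends with T.
	fmt.Println(MustKmerHasPrefix(encode("ACTGACTAG"), 0, k, lenPrefixSuffix))   // true: leading A
	fmt.Println(MustKmerHasSuffix(encode("CTGACTAGT"), ttt, k, lenPrefixSuffix)) // true: trailing T
	fmt.Println(MustKmerHasPrefix(encode("GACTGACTA"), 0, k, lenPrefixSuffix))   // false: leads with G
}
```

Run with `go run`, this prints `true`, `true`, `false`, matching which candidates the two scanning loops would now skip.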