From e95172a1d505203d8ec0f5d18b65e3e29e5a5bbe Mon Sep 17 00:00:00 2001 From: "Sabrina J. Ward" Date: Sun, 18 Sep 2022 18:52:02 +0100 Subject: [PATCH 1/2] Better conversion from 2 to 4 bit nucleic kmers --- src/constructors.jl | 328 +++++++++++++++++++++++++++++++++++++++ src/kmer.jl | 325 +++----------------------------------- src/revtrans.jl | 1 - src/tuple_bitflipping.jl | 74 ++++++++- 4 files changed, 425 insertions(+), 303 deletions(-) create mode 100644 src/constructors.jl diff --git a/src/constructors.jl b/src/constructors.jl new file mode 100644 index 0000000..4b6ea56 --- /dev/null +++ b/src/constructors.jl @@ -0,0 +1,328 @@ +### +### Constructors for Kmer types +### + +#= +These are (hopefully!) very optimised kernel functions for building kmer internal +data from individual elements or from sequences. Kmers themselves are static, +tuple-based structs, and so I really didn't want these functions to create memory +allocations or GC activity through use of vectors an such, for what should be +the creation of a single, rather simple value. +=# + +""" + _build_kmer_data(::Type{Kmer{A,K,N}}, seq::LongSequence{A}, from::Int = 1) where {A,K,N} + +Construct a ntuple of the bits data for an instance of a Kmer{A,K,N}. + +This particular method is specialised for LongSequences, and for when the Kmer +and LongSequence types used, share the same alphabet, since a lot of encoding / +decoding can be skipped, and the problem is mostly one of shunting bits around. +""" +@inline function _build_kmer_data(::Type{Kmer{A,K,N}}, seq::LongSequence{A}, from::Int = 1) where {A,K,N} + checkmer(Kmer{A,K,N}) + + bits_per_sym = BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold. + n_head = elements_in_head(Kmer{A,K,N}) # Based on kmer type, should constant fold. + n_per_chunk = per_word_capacity(Kmer{A,K,N}) # Based on kmer type, should constant fold. + + if from + K - 1 > length(seq) + return nothing + end + + # Construct the head. + head = zero(UInt64) + @inbounds for i in from:(from + n_head - 1) + bits = UInt64(BioSequences.extract_encoded_element(seq, i)) + head = (head << bits_per_sym) | bits + end + + # And the rest of the sequence + idx = Ref(from + n_head) + tail = ntuple(Val{N - 1}()) do i + Base.@_inline_meta + body = zero(UInt64) + @inbounds for _ in 1:n_per_chunk + bits = UInt64(BioSequences.extract_encoded_element(seq, idx[])) + body = (body << bits_per_sym) | bits + idx[] += 1 + end + return body + end + + # Put head and tail together + return (head, tail...) +end + +""" + Kmer{A,K,N}(itr) where {A,K,N} + +Construct a `Kmer{A,K,N}` from an iterable. + +The most generic constructor. + +Currently the iterable must have `length` & support `getindex` with integers. + +# Examples + +```jldoctest +julia> ntseq = LongSequence("TTAGC") # 4-bit DNA alphabet +5nt DNA Sequence: +TTAGC + +julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet +DNA 5-mer: +TTAGC +``` +""" +function Kmer{A,K,N}(itr) where {A,K,N} + checkmer(Kmer{A,K,N}) + + seqlen = length(itr) + if seqlen != K + throw(ArgumentError("itr does not contain enough elements ($seqlen ≠ $K)")) + end + + ## All based on alphabet type of Kmer, so should constant fold. + bits_per_sym = BioSequences.bits_per_symbol(A()) + n_head = elements_in_head(Kmer{A,K,N}) + n_per_chunk = per_word_capacity(Kmer{A,K,N}) + + # Construct the head. + head = zero(UInt64) + @inbounds for i in 1:n_head + sym = convert(eltype(Kmer{A,K,N}), itr[i]) + # Encode will throw if it cant encode an element. + head = (head << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) + end + + # And the rest of the sequence + idx = Ref(n_head + 1) + tail = ntuple(Val{N - 1}()) do i + Base.@_inline_meta + body = zero(UInt64) + @inbounds for i in 1:n_per_chunk + sym = convert(eltype(Kmer{A,K,N}), itr[idx[]]) + # Encode will throw if it cant encode an element. + body = (body << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) + idx[] += 1 + end + return body + end + + data = (head, tail...) + + return Kmer{A,K,N}(data) +end + +""" + Kmer{A,K,N}(seq::BioSequence{A}) + +Construct a `Kmer{A,K,N}` from a `BioSequence{A}`. + +This particular method is specialised for BioSequences, and for when the Kmer +and BioSequence types used, share the same alphabet, since a lot of encoding / +decoding can be skipped, and the problem is mostly one of shunting bits around. +In the case where the alphabet of the Kmer and the alphabet of the BioSequence +differ, dispatch to the more generic constructor occurs instead. + +# Examples + +```jldoctest +julia> ntseq = LongSequence{DNAAlphabet{2}}("TTAGC") # 2-bit DNA alphabet +5nt DNA Sequence: +TTAGC + +julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet +DNA 5-mer: +TTAGC +``` +""" +@inline function Kmer{A,K,N}(seq::BioSequence{A}) where {A,K,N} + checkmer(Kmer{A,K,N}) + + seqlen = length(seq) + if seqlen != K + throw(ArgumentError("seq is not the correct length ($seqlen ≠ $K)")) + end + + ## All based on alphabet type of Kmer, so should constant fold. + bits_per_sym = BioSequences.bits_per_symbol(A()) + n_head = elements_in_head(Kmer{A,K,N}) + n_per_chunk = per_word_capacity(Kmer{A,K,N}) + + # Construct the head. + head = zero(UInt64) + @inbounds for i in 1:n_head + bits = UInt64(BioSequences.extract_encoded_element(seq, i)) + head = (head << bits_per_sym) | bits + end + + # And the rest of the sequence + idx = Ref(n_head + 1) + tail = ntuple(Val{N - 1}()) do i + Base.@_inline_meta + body = zero(UInt64) + @inbounds for _ in 1:n_per_chunk + bits = UInt64(BioSequences.extract_encoded_element(seq, idx[])) + body = (body << bits_per_sym) | bits + idx[] += 1 + end + return body + end + + data = (head, tail...) + + return Kmer{A,K,N}(data) +end + + +""" + Kmer{A,K,n}(x::Kmer{B,K,N}) where {A<:NucleicAcidAlphabet{4},B<:NucleicAcidAlphabet{2},K,N,n} + +A more optimal method of constructing four-bit encoded nucleic +acid kmers, from two-bit encoded nucleic acid kmers. +""" +@inline function Kmer{A,K,n}(x::Kmer{B,K,N}) where + {A<:NucleicAcidAlphabet{4},B<:NucleicAcidAlphabet{2},K,N,n} + checkmer(Kmer{A,K,n}) + newbits = transcode_bits(x.data, B(), A()) + clippedbits = _drophead(newbits, Val{length(newbits) - n}()) + return Kmer{A,K,n}(clippedbits) +end + + +""" + Kmer{A,K}(x::Kmer{B,K,N}) where {A<:NucleicAcidAlphabet{4},B<:NucleicAcidAlphabet{2},K,N} + +A more optimal method of constructing four-bit encoded nucleic +acid kmers, from two-bit encoded nucleic acid kmers. + +Works out appropriate N for the output type as a convenience. +""" +@inline function Kmer{A,K}(x::Kmer{B,K,N}) where + {A<:NucleicAcidAlphabet{4},B<:NucleicAcidAlphabet{2},K,N} + return kmertype(Kmer{A,K})(x) +end + + +""" + Kmer{A}(x::Kmer{B,K,N}) where {A<:NucleicAcidAlphabet{4},B<:NucleicAcidAlphabet{2},K,N} + +A more optimal method of constructing four-bit encoded nucleic +acid kmers, from two-bit encoded nucleic acid kmers. + +Works out appropriate K and N for output type as a convenience. +""" +@inline function Kmer{A}(x::Kmer{B,K,N}) where + {A<:NucleicAcidAlphabet{4},B<:NucleicAcidAlphabet{2},K,N} + return Kmer{A,K}(x) +end + + + + + + + + +# Convenience version of function above so you don't have to work out correct N. +""" + Kmer{A,K}(itr) where {A,K} + +Construct a `Kmer{A,K,N}` from an iterable. + +This is a convenience method which will work out the correct `N` parameter, for +your given choice of `A` & `K`. +""" +@inline function Kmer{A,K}(itr) where {A,K} + T = kmertype(Kmer{A,K}) + return T(itr) +end + +""" + Kmer{A}(itr) where {A} + +Construct a `Kmer{A,K,N}` from an iterable. + +This is a convenience method which will work out K from the length of `itr`, and +the correct `N` parameter, for your given choice of `A` & `K`. + +!!! warning + Since this gets K from runtime values, this is gonna be slow! +""" +@inline Kmer{A}(itr) where {A} = Kmer{A,length(itr)}(itr) +@inline Kmer(seq::BioSequence{A}) where A = Kmer{A}(seq) + +function Kmer{A1}(seq::BioSequence{A2}) where {A1 <: NucleicAcidAlphabet, A2 <: NucleicAcidAlphabet} + kmertype(Kmer{A1, length(seq)})(seq) +end + +@inline function Kmer{A}(nts::Vararg{Union{DNA, RNA}, K}) where {A <: NucleicAcidAlphabet, K} + return kmertype(Kmer{A, K})(nts) +end + +""" + Kmer(nts::Vararg{DNA,K}) where {K} + +Construct a Kmer from a variable number `K` of DNA nucleotides. + +# Examples + +```jldoctest +julia> Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C) +DNA 5-mer: +TTAGC +``` +""" +@inline Kmer(nt::DNA, nts::Vararg{DNA}) = DNAKmer((nt, nts...)) + +""" + Kmer(nts::Vararg{RNA,K}) where {K} + +Construct a Kmer from a variable number `K` of RNA nucleotides. + +# Examples + +```jldoctest +julia> Kmer(RNA_U, RNA_U, RNA_A, RNA_G, RNA_C) +DNA 5-mer: +UUAGC +``` +""" +@inline Kmer(nt::RNA, nts::Vararg{RNA}) = RNAKmer((nt, nts...)) + + +""" + Kmer(seq::String) + +Construct a DNA or RNA kmer from a string. + +!!! warning + As a convenience method, this derives the `K`, `Alphabet`, and `N` parameters + for the `Kmer{A,K,N}` type from the input string. + +# Examples + +```jldoctest +julia> Kmer("TTAGC") +DNA 5-mer: +TTAGC +``` +""" +@inline function Kmer(seq::String) + seq′ = BioSequences.remove_newlines(seq) + hast = false + hasu = false + for c in seq′ + hast |= ((c == 'T') | (c == 't')) + hasu |= ((c == 'U') | (c == 'u')) + end + if (hast & hasu) | (!hast & !hasu) + throw(ArgumentError("Can't detect alphabet type from string")) + end + A = ifelse(hast & !hasu, DNAAlphabet{2}, RNAAlphabet{2}) + return Kmer{A,length(seq′)}(seq′) +end + + diff --git a/src/kmer.jl b/src/kmer.jl index 62f108c..a66711a 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -57,289 +57,47 @@ BioSequences.encoded_data(seq::Kmer{A,K,N}) where {A,K,N} = seq.data # Create a blank ntuple of appropriate length for a given Kmer with N. @inline blank_ntuple(::Type{Kmer{A,K,N}}) where {A,K,N} = ntuple(x -> zero(UInt64), Val{N}()) -### -### _build_kmer_data -### - -#= -These are (hopefully!) very optimised kernel functions for building kmer internal -data from individual elements or from sequences. Kmers themselves are static, -tuple-based structs, and so I really didn't want these functions to create memory -allocations or GC activity through use of vectors an such, for what should be -the creation of a single, rather simple value. -=# - -""" - _build_kmer_data(::Type{Kmer{A,K,N}}, seq::LongSequence{A}, from::Int = 1) where {A,K,N} - -Construct a ntuple of the bits data for an instance of a Kmer{A,K,N}. - -This particular method is specialised for LongSequences, and for when the Kmer -and LongSequence types used, share the same alphabet, since a lot of encoding / -decoding can be skipped, and the problem is mostly one of shunting bits around. -""" -@inline function _build_kmer_data(::Type{Kmer{A,K,N}}, seq::LongSequence{A}, from::Int = 1) where {A,K,N} - checkmer(Kmer{A,K,N}) - - bits_per_sym = BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold. - n_head = elements_in_head(Kmer{A,K,N}) # Based on kmer type, should constant fold. - n_per_chunk = per_word_capacity(Kmer{A,K,N}) # Based on kmer type, should constant fold. - - if from + K - 1 > length(seq) - return nothing - end - - # Construct the head. - head = zero(UInt64) - @inbounds for i in from:(from + n_head - 1) - bits = UInt64(BioSequences.extract_encoded_element(seq, i)) - head = (head << bits_per_sym) | bits - end - - # And the rest of the sequence - idx = Ref(from + n_head) - tail = ntuple(Val{N - 1}()) do i - Base.@_inline_meta - body = zero(UInt64) - @inbounds for _ in 1:n_per_chunk - bits = UInt64(BioSequences.extract_encoded_element(seq, idx[])) - body = (body << bits_per_sym) | bits - idx[] += 1 - end - return body - end - - # Put head and tail together - return (head, tail...) -end - - - -### -### Constructors -### - -""" - Kmer{A,K,N}(itr) where {A,K,N} - -Construct a `Kmer{A,K,N}` from an iterable. - -The most generic constructor. - -Currently the iterable must have `length` & support `getindex` with integers. - -# Examples - -```jldoctest -julia> ntseq = LongSequence("TTAGC") # 4-bit DNA alphabet -5nt DNA Sequence: -TTAGC - -julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet -DNA 5-mer: -TTAGC -``` -""" -function Kmer{A,K,N}(itr) where {A,K,N} - checkmer(Kmer{A,K,N}) - - seqlen = length(itr) - if seqlen != K - throw(ArgumentError("itr does not contain enough elements ($seqlen ≠ $K)")) - end - - ## All based on alphabet type of Kmer, so should constant fold. - bits_per_sym = BioSequences.bits_per_symbol(A()) - n_head = elements_in_head(Kmer{A,K,N}) - n_per_chunk = per_word_capacity(Kmer{A,K,N}) - - # Construct the head. - head = zero(UInt64) - @inbounds for i in 1:n_head - sym = convert(eltype(Kmer{A,K,N}), itr[i]) - # Encode will throw if it cant encode an element. - head = (head << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) - end - - # And the rest of the sequence - idx = Ref(n_head + 1) - tail = ntuple(Val{N - 1}()) do i - Base.@_inline_meta - body = zero(UInt64) - @inbounds for i in 1:n_per_chunk - sym = convert(eltype(Kmer{A,K,N}), itr[idx[]]) - # Encode will throw if it cant encode an element. - body = (body << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) - idx[] += 1 - end - return body - end - - data = (head, tail...) - - return Kmer{A,K,N}(data) -end - -""" - Kmer{A,K,N}(seq::BioSequence{A}) - -Construct a `Kmer{A,K,N}` from a `BioSequence{A}`. - -This particular method is specialised for BioSequences, and for when the Kmer -and BioSequence types used, share the same alphabet, since a lot of encoding / -decoding can be skipped, and the problem is mostly one of shunting bits around. -In the case where the alphabet of the Kmer and the alphabet of the BioSequence -differ, dispatch to the more generic constructor occurs instead. - -# Examples - -```jldoctest -julia> ntseq = LongSequence{DNAAlphabet{2}}("TTAGC") # 2-bit DNA alphabet -5nt DNA Sequence: -TTAGC - -julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet -DNA 5-mer: -TTAGC -``` -""" -@inline function Kmer{A,K,N}(seq::BioSequence{A}) where {A,K,N} - checkmer(Kmer{A,K,N}) - - seqlen = length(seq) - if seqlen != K - throw(ArgumentError("seq is not the correct length ($seqlen ≠ $K)")) - end - - ## All based on alphabet type of Kmer, so should constant fold. - bits_per_sym = BioSequences.bits_per_symbol(A()) - n_head = elements_in_head(Kmer{A,K,N}) - n_per_chunk = per_word_capacity(Kmer{A,K,N}) - - # Construct the head. - head = zero(UInt64) - @inbounds for i in 1:n_head - bits = UInt64(BioSequences.extract_encoded_element(seq, i)) - head = (head << bits_per_sym) | bits - end - - # And the rest of the sequence - idx = Ref(n_head + 1) - tail = ntuple(Val{N - 1}()) do i - Base.@_inline_meta - body = zero(UInt64) - @inbounds for _ in 1:n_per_chunk - bits = UInt64(BioSequences.extract_encoded_element(seq, idx[])) - body = (body << bits_per_sym) | bits - idx[] += 1 - end - return body - end - - data = (head, tail...) - - return Kmer{A,K,N}(data) -end - - -# Convenience version of function above so you don't have to work out correct N. -""" - Kmer{A,K}(itr) where {A,K} - -Construct a `Kmer{A,K,N}` from an iterable. - -This is a convenience method which will work out the correct `N` parameter, for -your given choice of `A` & `K`. -""" -@inline function Kmer{A,K}(itr) where {A,K} - T = kmertype(Kmer{A,K}) - return T(itr) -end - -""" - Kmer{A}(itr) where {A} - -Construct a `Kmer{A,K,N}` from an iterable. - -This is a convenience method which will work out K from the length of `itr`, and -the correct `N` parameter, for your given choice of `A` & `K`. +# Aliases +"Shortcut for the type `Kmer{DNAAlphabet{2},K,N}`" +const DNAKmer{K,N} = Kmer{DNAAlphabet{2},K,N} -!!! warning - Since this gets K from runtime values, this is gonna be slow! -""" -@inline Kmer{A}(itr) where {A} = Kmer{A,length(itr)}(itr) -@inline Kmer(seq::BioSequence{A}) where A = Kmer{A}(seq) +"Shortcut for the type `DNAKmer{27,1}`" +const DNA27mer = DNAKmer{27,1} -function Kmer{A1}(seq::BioSequence{A2}) where {A1 <: NucleicAcidAlphabet, A2 <: NucleicAcidAlphabet} - kmertype(Kmer{A1, length(seq)})(seq) -end +"Shortcut for the type `DNAKmer{31,1}`" +const DNA31mer = DNAKmer{31,1} -@inline function Kmer{A}(nts::Vararg{Union{DNA, RNA}, K}) where {A <: NucleicAcidAlphabet, K} - return kmertype(Kmer{A, K})(nts) -end +"Shortcut for the type `DNAKmer{63,2}`" +const DNA63mer = DNAKmer{63,2} -""" - Kmer(nts::Vararg{DNA,K}) where {K} +"Shortcut for the type `Kmer{RNAAlphabet{2},K,N}`" +const RNAKmer{K,N} = Kmer{RNAAlphabet{2},K,N} -Construct a Kmer from a variable number `K` of DNA nucleotides. +"Shortcut for the type `RNAKmer{27,1}`" +const RNA27mer = RNAKmer{27,1} -# Examples +"Shortcut for the type `RNAKmer{31,1}`" +const RNA31mer = RNAKmer{31,1} -```jldoctest -julia> Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C) -DNA 5-mer: -TTAGC -``` -""" -@inline Kmer(nt::DNA, nts::Vararg{DNA}) = DNAKmer((nt, nts...)) +"Shortcut for the type `RNAKmer{63,2}`" +const RNA63mer = RNAKmer{63,2} -""" - Kmer(nts::Vararg{RNA,K}) where {K} +"Shortcut for the type `Kmer{AminoAcidAlphabet,K,N}`" +const AAKmer{K,N} = Kmer{AminoAcidAlphabet,K,N} -Construct a Kmer from a variable number `K` of RNA nucleotides. +"Shorthand for `DNAKmer{3,1}`" +const DNACodon = DNAKmer{3,1} -# Examples +"Shorthand for `RNAKmer{3,1}`" +const RNACodon = RNAKmer{3,1} -```jldoctest -julia> Kmer(RNA_U, RNA_U, RNA_A, RNA_G, RNA_C) -DNA 5-mer: -UUAGC -``` -""" -@inline Kmer(nt::RNA, nts::Vararg{RNA}) = RNAKmer((nt, nts...)) -""" - Kmer(seq::String) -Construct a DNA or RNA kmer from a string. -!!! warning - As a convenience method, this derives the `K`, `Alphabet`, and `N` parameters - for the `Kmer{A,K,N}` type from the input string. +include("constructors.jl") -# Examples -```jldoctest -julia> Kmer("TTAGC") -DNA 5-mer: -TTAGC -``` -""" -@inline function Kmer(seq::String) - seq′ = BioSequences.remove_newlines(seq) - hast = false - hasu = false - for c in seq′ - hast |= ((c == 'T') | (c == 't')) - hasu |= ((c == 'U') | (c == 'u')) - end - if (hast & hasu) | (!hast & !hasu) - throw(ArgumentError("Can't detect alphabet type from string")) - end - A = ifelse(hast & !hasu, DNAAlphabet{2}, RNAAlphabet{2}) - return Kmer{A,length(seq′)}(seq′) -end """ @@ -359,41 +117,6 @@ Kmer{DNAAlphabet{2},63,2} end @inline kmertype(::Type{Kmer{A,K,N}}) where {A,K,N} = Kmer{A,K,N} -# Aliases -"Shortcut for the type `Kmer{DNAAlphabet{2},K,N}`" -const DNAKmer{K,N} = Kmer{DNAAlphabet{2},K,N} - -"Shortcut for the type `DNAKmer{27,1}`" -const DNA27mer = DNAKmer{27,1} - -"Shortcut for the type `DNAKmer{31,1}`" -const DNA31mer = DNAKmer{31,1} - -"Shortcut for the type `DNAKmer{63,2}`" -const DNA63mer = DNAKmer{63,2} - -"Shortcut for the type `Kmer{RNAAlphabet{2},K,N}`" -const RNAKmer{K,N} = Kmer{RNAAlphabet{2},K,N} - -"Shortcut for the type `RNAKmer{27,1}`" -const RNA27mer = RNAKmer{27,1} - -"Shortcut for the type `RNAKmer{31,1}`" -const RNA31mer = RNAKmer{31,1} - -"Shortcut for the type `RNAKmer{63,2}`" -const RNA63mer = RNAKmer{63,2} - -"Shortcut for the type `Kmer{AminoAcidAlphabet,K,N}`" -const AAKmer{K,N} = Kmer{AminoAcidAlphabet,K,N} - -"Shorthand for `DNAKmer{3,1}`" -const DNACodon = DNAKmer{3,1} - -"Shorthand for `RNAKmer{3,1}`" -const RNACodon = RNAKmer{3,1} - - ### ### Base Functions ### diff --git a/src/revtrans.jl b/src/revtrans.jl index 4e8dc2e..fdf7db9 100644 --- a/src/revtrans.jl +++ b/src/revtrans.jl @@ -1,4 +1,3 @@ -struct Unsafe end const N_AA = length(AminoAcidAlphabet()) diff --git a/src/tuple_bitflipping.jl b/src/tuple_bitflipping.jl index 4476956..a851467 100644 --- a/src/tuple_bitflipping.jl +++ b/src/tuple_bitflipping.jl @@ -25,6 +25,25 @@ Notably it's used when constructing a Kmer from an existing NTuple of UInt64 return (head & (typemax(UInt64) >> by), tail...) end + +""" + _drophead(x::NTuple{N,UInt64}, drop::Val{D}) where {N,D} + +A function used to drop the first N elements from an NTuple. + +Intended for internal bit-flipping use. + +Notably used in cases where expanding a 2-bit encoding to +a 4-bit encoding results in an unused element at the head +of the tuple. +""" +@inline function _drophead(x::NTuple{N,UInt64}, drop::Val{D}) where {N,D} + ntuple(Val{N - D}()) do i + Base.@_inline_meta + return @inbounds x[D + i] + end +end + #= rightshift_carry & leftshift_carry @@ -69,4 +88,57 @@ end end @inline _reverse(f::F, ::BioSequences.BitsPerSymbol{N}) where {N,F<:Function} = () -=# \ No newline at end of file +=# + +const transcode_2_to_4_nucs = (function () + + @info "Computing 2 -> 4 bit translation table..." + + fourbitnucs = (UInt32(1), UInt32(2), UInt32(4), UInt32(8)) + + fn(bits, lshift) = fourbitnucs[((bits >> lshift) & 0x03) + 1] << (2 * lshift) + + function translate2to4bits(bits::UInt16) + fn(bits, 0) | + fn(bits, 2) | + fn(bits, 4) | + fn(bits, 6) | + fn(bits, 8) | + fn(bits, 10) | + fn(bits, 12) | + fn(bits, 14) + end + + ntuple((x) -> translate2to4bits(UInt16(x - 1)), 65536) +end)() + +@inline function transcode_kernel(bits::UInt64, from::A, to::B) where + {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4}} + + @inbounds begin + b = UInt64(transcode_2_to_4_nucs[((bits >> 0) & 0xFFFF) + 1]) << 0 | + UInt64(transcode_2_to_4_nucs[((bits >> 16) & 0xFFFF) + 1]) << 32 + + a = UInt64(transcode_2_to_4_nucs[((bits >> 32) & 0xFFFF) + 1]) << 0 | + UInt64(transcode_2_to_4_nucs[((bits >> 48) & 0xFFFF) + 1]) << 32 + end + return (a, b) +end + +@inline function transcode_bits(x::NTuple{N,UInt64}, from::A, to::B) where + {N,A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4}} + + return _transcode_bits(from, to, x...) +end + +@inline function _transcode_bits(from::A, to::B, head::UInt64, tail...) where + {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4}} + + return (transcode_kernel(head, from, to)..., _transcode_bits(from, to, tail...)...) +end + +@inline function _transcode_bits(from::A, to::B, head::UInt64) where + {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4}} + + return transcode_kernel(head, from, to) +end From dc1912da46ac5710e1aa74d83632f8dbecba9afe Mon Sep 17 00:00:00 2001 From: "Sabrina J. Ward" Date: Sun, 9 Apr 2023 01:53:05 +0100 Subject: [PATCH 2/2] I dunno some stuff I guess --- src/constructors.jl | 117 +++++++++++++++++++++++++++++++++----------- src/kmer.jl | 3 -- test/find.jl | 8 +-- 3 files changed, 92 insertions(+), 36 deletions(-) diff --git a/src/constructors.jl b/src/constructors.jl index 4b6ea56..561fff7 100644 --- a/src/constructors.jl +++ b/src/constructors.jl @@ -54,56 +54,85 @@ decoding can be skipped, and the problem is mostly one of shunting bits around. return (head, tail...) end -""" - Kmer{A,K,N}(itr) where {A,K,N} +function _construct_from_iterable(::Type{Kmer{A,K,N}}, iter, isize::Base.SizeUnknown) where {A,K,N} + ## All based on alphabet type of Kmer, so should constant fold. + checkmer(Kmer{A,K,N}) + ET = eltype(Kmer{A,K,N}) + bits_per_sym = BioSequences.bits_per_symbol(A()) + n_head = elements_in_head(Kmer{A,K,N}) + n_per_chunk = per_word_capacity(Kmer{A,K,N}) -Construct a `Kmer{A,K,N}` from an iterable. + stateful = Iterators.Stateful(iter) + head_filled = 0 -The most generic constructor. + # Construct the head. + head = zero(UInt64) + next = Ref{Union{Tuple{ET,Nothing},Nothing}}(iterate(stateful)) + while next[] !== nothing && head_filled < n_head + el, _ = next[] + sym = convert(ET, el) + # Encode will throw if it cant encode an element. + head = (head << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) + head_filled += 1 + next[] = iterate(stateful) + end -Currently the iterable must have `length` & support `getindex` with integers. + if head_filled != n_head + error("Iterator did not yield enough elements to construct $K-mer") + end -# Examples + tail = ntuple(Val{N - 1}()) do i + Base.@_inline_meta + chunk_filled = 0 + body = zero(UInt64) + while next[] !== nothing && chunk_filled < n_per_chunk + el, _ = next[] + sym = convert(ET, el) + # Encode will throw if it cant encode an element. + body = (body << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) + chunk_filled += 1 + next[] = iterate(stateful) + end -```jldoctest -julia> ntseq = LongSequence("TTAGC") # 4-bit DNA alphabet -5nt DNA Sequence: -TTAGC + if chunk_filled != n_per_chunk + error("Iterator did not yield enough elements to construct $K-mer") + end -julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet -DNA 5-mer: -TTAGC -``` -""" -function Kmer{A,K,N}(itr) where {A,K,N} - checkmer(Kmer{A,K,N}) - - seqlen = length(itr) - if seqlen != K - throw(ArgumentError("itr does not contain enough elements ($seqlen ≠ $K)")) + return body end + data = (head, tail...) + + return Kmer{A,K,N}(data) +end + +function _construct_from_iterable(::Type{Kmer{A,K,N}}, iter, isize::Base.HasLength) where {A,K,N} ## All based on alphabet type of Kmer, so should constant fold. + checkmer(Kmer{A,K,N}) + ET = eltype(Kmer{A,K,N}) bits_per_sym = BioSequences.bits_per_symbol(A()) n_head = elements_in_head(Kmer{A,K,N}) n_per_chunk = per_word_capacity(Kmer{A,K,N}) + + ilen = length(iter) + if ilen != K + throw(ArgumentError("seq is not the correct length ($ilen ≠ $K)")) + end # Construct the head. head = zero(UInt64) - @inbounds for i in 1:n_head - sym = convert(eltype(Kmer{A,K,N}), itr[i]) - # Encode will throw if it cant encode an element. + @inbounds for i in firstindex(iter):(firstindex(iter) + n_head - 1) + sym = convert(ET, iter[i]) head = (head << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) end # And the rest of the sequence - idx = Ref(n_head + 1) - tail = ntuple(Val{N - 1}()) do i + idx = Ref(firstindex(iter) + n_head) + tail = ntuple(Val{N - 1}()) do _ Base.@_inline_meta body = zero(UInt64) - @inbounds for i in 1:n_per_chunk - sym = convert(eltype(Kmer{A,K,N}), itr[idx[]]) - # Encode will throw if it cant encode an element. + @inbounds for _ in 1:n_per_chunk + sym = convert(ET, iter[i]) body = (body << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) idx[] += 1 end @@ -115,6 +144,36 @@ function Kmer{A,K,N}(itr) where {A,K,N} return Kmer{A,K,N}(data) end + +""" + Kmer{A,K,N}(itr) where {A,K,N} + +Construct a `Kmer{A,K,N}` from an iterable. + +The most generic constructor of a Kmer from any iterable type +that implements the bare minimum Iterator interface. + +# Examples + +```jldoctest +julia> ntseq = LongSequence("TTAGC") # 4-bit DNA alphabet +5nt DNA Sequence: +TTAGC + +julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet +DNA 5-mer: +TTAGC +``` +""" +function Kmer{A,K,N}(itr) where {A,K,N} + return _construct_from_iterable(Kmer{A,K,N}, itr, Base.IteratorSize(typeof(itr))) +end + + + + + + """ Kmer{A,K,N}(seq::BioSequence{A}) diff --git a/src/kmer.jl b/src/kmer.jl index a66711a..ddc09ba 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -146,9 +146,6 @@ checked at compile time, and the branches / error throws eliminated when the parameterisation of the Kmer type is good. """ @inline function checkmer(::Type{Kmer{A,K,N}}) where {A,K,N} - if K < 1 - throw(ArgumentError("Bad kmer parameterisation. K must be greater than 0.")) - end n = BioSequences.seq_data_len(A, K) if n !== N # This has been significantly changed conceptually from before. Now we diff --git a/test/find.jl b/test/find.jl index c3d4aaa..f258ab0 100644 --- a/test/find.jl +++ b/test/find.jl @@ -4,7 +4,7 @@ @test findnext(DNA_A, kmer, 1) == 1 @test findnext(DNA_C, kmer, 1) == 2 @test findnext(DNA_G, kmer, 1) == 3 - @test findnext(DNA_T, kmer, 1) == nothing + @test findnext(DNA_T, kmer, 1) === nothing @test findnext(DNA_A, kmer, 2) == 4 @test_throws BoundsError findnext(DNA_A, kmer, 0) @@ -13,7 +13,7 @@ @test findprev(DNA_A, kmer, 5) == 4 @test findprev(DNA_C, kmer, 5) == 2 @test findprev(DNA_G, kmer, 5) == 5 - @test findprev(DNA_T, kmer, 5) == nothing + @test findprev(DNA_T, kmer, 5) === nothing @test findprev(DNA_G, kmer, 4) == 3 @test findprev(DNA_A, kmer, 0) === nothing @@ -37,8 +37,8 @@ @test findnext(AA_T, kmer, 1) == 9 @test findnext(AA_F, kmer, 4) == 5 - @test findprev(AA_F, kmer, 4) == nothing - @test findnext(AA_A, kmer, 7) == nothing + @test findprev(AA_F, kmer, 4) === nothing + @test findnext(AA_A, kmer, 7) === nothing @test findnext(AA_M, kmer, 5) == 8 @test findfirst(AA_M, kmer) == 2