From e95172a1d505203d8ec0f5d18b65e3e29e5a5bbe Mon Sep 17 00:00:00 2001
From: "Sabrina J. Ward" <sabrinajward@protonmail.com>
Date: Sun, 18 Sep 2022 18:52:02 +0100
Subject: [PATCH 1/2] Better conversion from 2 to 4 bit nucleic kmers

---
 src/constructors.jl      | 328 +++++++++++++++++++++++++++++++++++++++
 src/kmer.jl              | 325 +++-----------------------------------
 src/revtrans.jl          |   1 -
 src/tuple_bitflipping.jl |  74 ++++++++-
 4 files changed, 425 insertions(+), 303 deletions(-)
 create mode 100644 src/constructors.jl

diff --git a/src/constructors.jl b/src/constructors.jl
new file mode 100644
index 0000000..4b6ea56
--- /dev/null
+++ b/src/constructors.jl
@@ -0,0 +1,328 @@
+###
+### Constructors for Kmer types
+###
+
+#=
+These are (hopefully!) very optimised kernel functions for building kmer internal
+data from individual elements or from sequences. Kmers themselves are static,
+tuple-based structs, and so I really didn't want these functions to create memory
+allocations or GC activity through use of vectors an such, for what should be
+the creation of a single, rather simple value.
+=#
+
+"""
+    _build_kmer_data(::Type{Kmer{A,K,N}}, seq::LongSequence{A}, from::Int = 1) where {A,K,N}
+
+Construct a ntuple of the bits data for an instance of a Kmer{A,K,N}.
+
+This particular method is specialised for LongSequences, and for when the Kmer
+and LongSequence types used, share the same alphabet, since a lot of encoding /
+decoding can be skipped, and the problem is mostly one of shunting bits around.
+"""
+@inline function _build_kmer_data(::Type{Kmer{A,K,N}}, seq::LongSequence{A}, from::Int = 1) where {A,K,N}
+    checkmer(Kmer{A,K,N})
+    
+    bits_per_sym = BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold.
+    n_head = elements_in_head(Kmer{A,K,N}) # Based on kmer type, should constant fold.
+    n_per_chunk = per_word_capacity(Kmer{A,K,N}) # Based on kmer type, should constant fold.
+    
+    if from + K - 1 > length(seq)
+        return nothing
+    end
+    
+    # Construct the head.
+    head = zero(UInt64)
+    @inbounds for i in from:(from + n_head - 1)
+        bits = UInt64(BioSequences.extract_encoded_element(seq, i))
+        head = (head << bits_per_sym) | bits
+    end
+    
+    # And the rest of the sequence
+    idx = Ref(from + n_head)
+    tail = ntuple(Val{N - 1}()) do i
+        Base.@_inline_meta
+        body = zero(UInt64)
+        @inbounds for _ in 1:n_per_chunk
+            bits = UInt64(BioSequences.extract_encoded_element(seq, idx[]))
+            body = (body << bits_per_sym) | bits
+            idx[] += 1
+        end
+        return body
+    end
+    
+    # Put head and tail together
+    return (head, tail...)
+end
+
+"""
+    Kmer{A,K,N}(itr) where {A,K,N}
+
+Construct a `Kmer{A,K,N}` from an iterable.
+
+The most generic constructor.
+
+Currently the iterable must have `length` & support `getindex` with integers.
+
+# Examples
+
+```jldoctest
+julia> ntseq = LongSequence("TTAGC") # 4-bit DNA alphabet
+5nt DNA Sequence:
+TTAGC
+
+julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet
+DNA 5-mer:
+TTAGC
+```
+"""
+function Kmer{A,K,N}(itr) where {A,K,N}
+    checkmer(Kmer{A,K,N})
+    
+    seqlen = length(itr)
+    if seqlen != K
+        throw(ArgumentError("itr does not contain enough elements ($seqlen ≠ $K)"))
+    end
+    
+    ## All based on alphabet type of Kmer, so should constant fold.
+    bits_per_sym = BioSequences.bits_per_symbol(A())
+    n_head = elements_in_head(Kmer{A,K,N})
+    n_per_chunk = per_word_capacity(Kmer{A,K,N})
+    
+    # Construct the head.
+    head = zero(UInt64)
+    @inbounds for i in 1:n_head
+        sym = convert(eltype(Kmer{A,K,N}), itr[i])
+        # Encode will throw if it cant encode an element.
+        head = (head << bits_per_sym) | UInt64(BioSequences.encode(A(), sym))
+    end
+    
+    # And the rest of the sequence
+    idx = Ref(n_head + 1)
+    tail = ntuple(Val{N - 1}()) do i
+        Base.@_inline_meta
+        body = zero(UInt64)
+        @inbounds for i in 1:n_per_chunk
+            sym = convert(eltype(Kmer{A,K,N}), itr[idx[]])
+            # Encode will throw  if it cant encode an element.
+            body = (body << bits_per_sym) | UInt64(BioSequences.encode(A(), sym))
+            idx[] += 1
+        end
+        return body
+    end
+    
+    data = (head, tail...)
+    
+    return Kmer{A,K,N}(data)
+end
+
+"""
+    Kmer{A,K,N}(seq::BioSequence{A})
+
+Construct a `Kmer{A,K,N}` from a `BioSequence{A}`.
+
+This particular method is specialised for BioSequences, and for when the Kmer
+and BioSequence types used, share the same alphabet, since a lot of encoding /
+decoding can be skipped, and the problem is mostly one of shunting bits around.
+In the case where the alphabet of the Kmer and the alphabet of the BioSequence
+differ, dispatch to the more generic constructor occurs instead.
+
+# Examples
+
+```jldoctest
+julia> ntseq = LongSequence{DNAAlphabet{2}}("TTAGC") # 2-bit DNA alphabet
+5nt DNA Sequence:
+TTAGC
+
+julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet
+DNA 5-mer:
+TTAGC
+```
+"""
+@inline function Kmer{A,K,N}(seq::BioSequence{A}) where {A,K,N}
+    checkmer(Kmer{A,K,N})
+    
+    seqlen = length(seq)
+    if seqlen != K
+        throw(ArgumentError("seq is not the correct length ($seqlen ≠ $K)"))
+    end
+    
+    ## All based on alphabet type of Kmer, so should constant fold.
+    bits_per_sym = BioSequences.bits_per_symbol(A())
+    n_head = elements_in_head(Kmer{A,K,N})
+    n_per_chunk = per_word_capacity(Kmer{A,K,N})
+    
+    # Construct the head.
+    head = zero(UInt64)
+    @inbounds for i in 1:n_head
+        bits = UInt64(BioSequences.extract_encoded_element(seq, i))
+        head = (head << bits_per_sym) | bits
+    end
+    
+    # And the rest of the sequence
+    idx = Ref(n_head + 1)
+    tail = ntuple(Val{N - 1}()) do i
+        Base.@_inline_meta
+        body = zero(UInt64)
+        @inbounds for _ in 1:n_per_chunk
+            bits = UInt64(BioSequences.extract_encoded_element(seq, idx[]))
+            body = (body << bits_per_sym) | bits
+            idx[] += 1
+        end
+        return body
+    end
+    
+    data = (head, tail...)
+    
+    return Kmer{A,K,N}(data)
+end
+
+
+"""
+    Kmer{A,K,n}(x::Kmer{B,K,N}) where {A<:NucleicAcidAlphabet{4},B<:NucleicAcidAlphabet{2},K,N,n}
+
+A more optimal method of constructing four-bit encoded nucleic
+acid kmers, from two-bit encoded nucleic acid kmers. 
+"""
+@inline function Kmer{A,K,n}(x::Kmer{B,K,N}) where 
+    {A<:NucleicAcidAlphabet{4},B<:NucleicAcidAlphabet{2},K,N,n}
+    checkmer(Kmer{A,K,n})
+    newbits = transcode_bits(x.data, B(), A())
+    clippedbits = _drophead(newbits, Val{length(newbits) - n}())
+    return Kmer{A,K,n}(clippedbits)
+end
+
+
+"""
+    Kmer{A,K}(x::Kmer{B,K,N}) where {A<:NucleicAcidAlphabet{4},B<:NucleicAcidAlphabet{2},K,N}
+
+A more optimal method of constructing four-bit encoded nucleic
+acid kmers, from two-bit encoded nucleic acid kmers.
+
+Works out appropriate N for the output type as a convenience.
+"""
+@inline function Kmer{A,K}(x::Kmer{B,K,N}) where
+    {A<:NucleicAcidAlphabet{4},B<:NucleicAcidAlphabet{2},K,N}
+   return kmertype(Kmer{A,K})(x) 
+end
+
+
+"""
+    Kmer{A}(x::Kmer{B,K,N}) where {A<:NucleicAcidAlphabet{4},B<:NucleicAcidAlphabet{2},K,N}
+
+A more optimal method of constructing four-bit encoded nucleic
+acid kmers, from two-bit encoded nucleic acid kmers.
+
+Works out appropriate K and N for output type as a convenience.
+"""
+@inline function Kmer{A}(x::Kmer{B,K,N}) where
+    {A<:NucleicAcidAlphabet{4},B<:NucleicAcidAlphabet{2},K,N}
+    return Kmer{A,K}(x)
+end
+
+
+
+
+
+
+
+
+# Convenience version of function above so you don't have to work out correct N.
+"""
+    Kmer{A,K}(itr) where {A,K}
+
+Construct a `Kmer{A,K,N}` from an iterable.
+
+This is a convenience method which will work out the correct `N` parameter, for
+your given choice of `A` & `K`.
+"""
+@inline function Kmer{A,K}(itr) where {A,K}
+    T = kmertype(Kmer{A,K})
+    return T(itr)
+end
+
+"""
+    Kmer{A}(itr) where {A}
+
+Construct a `Kmer{A,K,N}` from an iterable.
+
+This is a convenience method which will work out K from the length of `itr`, and
+the correct `N` parameter, for your given choice of `A` & `K`.
+
+!!! warning
+    Since this gets K from runtime values, this is gonna be slow!
+"""
+@inline Kmer{A}(itr) where {A} = Kmer{A,length(itr)}(itr)
+@inline Kmer(seq::BioSequence{A}) where A = Kmer{A}(seq)
+
+function Kmer{A1}(seq::BioSequence{A2}) where {A1 <: NucleicAcidAlphabet, A2 <: NucleicAcidAlphabet}
+    kmertype(Kmer{A1, length(seq)})(seq)
+end
+
+@inline function Kmer{A}(nts::Vararg{Union{DNA, RNA}, K}) where {A <: NucleicAcidAlphabet, K}
+    return kmertype(Kmer{A, K})(nts)
+end
+
+"""
+    Kmer(nts::Vararg{DNA,K}) where {K}
+
+Construct a Kmer from a variable number `K` of DNA nucleotides.
+
+# Examples
+
+```jldoctest
+julia> Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C)
+DNA 5-mer:
+TTAGC
+```
+"""
+@inline Kmer(nt::DNA, nts::Vararg{DNA}) = DNAKmer((nt, nts...))
+
+"""
+    Kmer(nts::Vararg{RNA,K}) where {K}
+
+Construct a Kmer from a variable number `K` of RNA nucleotides.
+
+# Examples
+
+```jldoctest
+julia> Kmer(RNA_U, RNA_U, RNA_A, RNA_G, RNA_C)
+DNA 5-mer:
+UUAGC
+```
+"""
+@inline Kmer(nt::RNA, nts::Vararg{RNA}) = RNAKmer((nt, nts...))
+
+
+"""
+    Kmer(seq::String)
+
+Construct a DNA or RNA kmer from a string.
+
+!!! warning
+    As a convenience method, this derives the `K`, `Alphabet`, and `N` parameters
+    for the `Kmer{A,K,N}` type from the input string.
+
+# Examples
+
+```jldoctest
+julia> Kmer("TTAGC")
+DNA 5-mer:
+TTAGC
+```
+"""
+@inline function Kmer(seq::String)
+    seq′ = BioSequences.remove_newlines(seq)
+    hast = false
+    hasu = false
+    for c in seq′
+        hast |= ((c == 'T') | (c == 't'))
+        hasu |= ((c == 'U') | (c == 'u'))
+    end
+    if (hast & hasu) | (!hast & !hasu)
+        throw(ArgumentError("Can't detect alphabet type from string"))
+    end
+    A = ifelse(hast & !hasu, DNAAlphabet{2}, RNAAlphabet{2})
+    return Kmer{A,length(seq′)}(seq′)
+end
+
+
diff --git a/src/kmer.jl b/src/kmer.jl
index 62f108c..a66711a 100644
--- a/src/kmer.jl
+++ b/src/kmer.jl
@@ -57,289 +57,47 @@ BioSequences.encoded_data(seq::Kmer{A,K,N}) where {A,K,N} = seq.data
 # Create a blank ntuple of appropriate length for a given Kmer with N.
 @inline blank_ntuple(::Type{Kmer{A,K,N}}) where {A,K,N} = ntuple(x -> zero(UInt64), Val{N}())
 
-###
-### _build_kmer_data
-###
-
-#=
-These are (hopefully!) very optimised kernel functions for building kmer internal
-data from individual elements or from sequences. Kmers themselves are static,
-tuple-based structs, and so I really didn't want these functions to create memory
-allocations or GC activity through use of vectors an such, for what should be
-the creation of a single, rather simple value.
-=#
-
-"""
-    _build_kmer_data(::Type{Kmer{A,K,N}}, seq::LongSequence{A}, from::Int = 1) where {A,K,N}
-
-Construct a ntuple of the bits data for an instance of a Kmer{A,K,N}.
-
-This particular method is specialised for LongSequences, and for when the Kmer
-and LongSequence types used, share the same alphabet, since a lot of encoding /
-decoding can be skipped, and the problem is mostly one of shunting bits around.
-"""
-@inline function _build_kmer_data(::Type{Kmer{A,K,N}}, seq::LongSequence{A}, from::Int = 1) where {A,K,N}
-    checkmer(Kmer{A,K,N})
-    
-    bits_per_sym = BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold.
-    n_head = elements_in_head(Kmer{A,K,N}) # Based on kmer type, should constant fold.
-    n_per_chunk = per_word_capacity(Kmer{A,K,N}) # Based on kmer type, should constant fold.
-    
-    if from + K - 1 > length(seq)
-        return nothing
-    end
-    
-    # Construct the head.
-    head = zero(UInt64)
-    @inbounds for i in from:(from + n_head - 1)
-        bits = UInt64(BioSequences.extract_encoded_element(seq, i))
-        head = (head << bits_per_sym) | bits
-    end
-    
-    # And the rest of the sequence
-    idx = Ref(from + n_head)
-    tail = ntuple(Val{N - 1}()) do i
-        Base.@_inline_meta
-        body = zero(UInt64)
-        @inbounds for _ in 1:n_per_chunk
-            bits = UInt64(BioSequences.extract_encoded_element(seq, idx[]))
-            body = (body << bits_per_sym) | bits
-            idx[] += 1
-        end
-        return body
-    end
-    
-    # Put head and tail together
-    return (head, tail...)
-end
-
-
-
-###
-### Constructors
-###
-
-"""
-    Kmer{A,K,N}(itr) where {A,K,N}
-
-Construct a `Kmer{A,K,N}` from an iterable.
-
-The most generic constructor.
-
-Currently the iterable must have `length` & support `getindex` with integers.
-
-# Examples
-
-```jldoctest
-julia> ntseq = LongSequence("TTAGC") # 4-bit DNA alphabet
-5nt DNA Sequence:
-TTAGC
-
-julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet
-DNA 5-mer:
-TTAGC
-```
-"""
-function Kmer{A,K,N}(itr) where {A,K,N}
-    checkmer(Kmer{A,K,N})
-    
-    seqlen = length(itr)
-    if seqlen != K
-        throw(ArgumentError("itr does not contain enough elements ($seqlen ≠ $K)"))
-    end
-    
-    ## All based on alphabet type of Kmer, so should constant fold.
-    bits_per_sym = BioSequences.bits_per_symbol(A())
-    n_head = elements_in_head(Kmer{A,K,N})
-    n_per_chunk = per_word_capacity(Kmer{A,K,N})
-    
-    # Construct the head.
-    head = zero(UInt64)
-    @inbounds for i in 1:n_head
-        sym = convert(eltype(Kmer{A,K,N}), itr[i])
-        # Encode will throw if it cant encode an element.
-        head = (head << bits_per_sym) | UInt64(BioSequences.encode(A(), sym))
-    end
-    
-    # And the rest of the sequence
-    idx = Ref(n_head + 1)
-    tail = ntuple(Val{N - 1}()) do i
-        Base.@_inline_meta
-        body = zero(UInt64)
-        @inbounds for i in 1:n_per_chunk
-            sym = convert(eltype(Kmer{A,K,N}), itr[idx[]])
-            # Encode will throw  if it cant encode an element.
-            body = (body << bits_per_sym) | UInt64(BioSequences.encode(A(), sym))
-            idx[] += 1
-        end
-        return body
-    end
-    
-    data = (head, tail...)
-    
-    return Kmer{A,K,N}(data)
-end
-
-"""
-    Kmer{A,K,N}(seq::BioSequence{A})
-
-Construct a `Kmer{A,K,N}` from a `BioSequence{A}`.
-
-This particular method is specialised for BioSequences, and for when the Kmer
-and BioSequence types used, share the same alphabet, since a lot of encoding /
-decoding can be skipped, and the problem is mostly one of shunting bits around.
-In the case where the alphabet of the Kmer and the alphabet of the BioSequence
-differ, dispatch to the more generic constructor occurs instead.
-
-# Examples
-
-```jldoctest
-julia> ntseq = LongSequence{DNAAlphabet{2}}("TTAGC") # 2-bit DNA alphabet
-5nt DNA Sequence:
-TTAGC
-
-julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet
-DNA 5-mer:
-TTAGC
-```
-"""
-@inline function Kmer{A,K,N}(seq::BioSequence{A}) where {A,K,N}
-    checkmer(Kmer{A,K,N})
-    
-    seqlen = length(seq)
-    if seqlen != K
-        throw(ArgumentError("seq is not the correct length ($seqlen ≠ $K)"))
-    end
-    
-    ## All based on alphabet type of Kmer, so should constant fold.
-    bits_per_sym = BioSequences.bits_per_symbol(A())
-    n_head = elements_in_head(Kmer{A,K,N})
-    n_per_chunk = per_word_capacity(Kmer{A,K,N})
-    
-    # Construct the head.
-    head = zero(UInt64)
-    @inbounds for i in 1:n_head
-        bits = UInt64(BioSequences.extract_encoded_element(seq, i))
-        head = (head << bits_per_sym) | bits
-    end
-    
-    # And the rest of the sequence
-    idx = Ref(n_head + 1)
-    tail = ntuple(Val{N - 1}()) do i
-        Base.@_inline_meta
-        body = zero(UInt64)
-        @inbounds for _ in 1:n_per_chunk
-            bits = UInt64(BioSequences.extract_encoded_element(seq, idx[]))
-            body = (body << bits_per_sym) | bits
-            idx[] += 1
-        end
-        return body
-    end
-    
-    data = (head, tail...)
-    
-    return Kmer{A,K,N}(data)
-end
-
-
-# Convenience version of function above so you don't have to work out correct N.
-"""
-    Kmer{A,K}(itr) where {A,K}
-
-Construct a `Kmer{A,K,N}` from an iterable.
-
-This is a convenience method which will work out the correct `N` parameter, for
-your given choice of `A` & `K`.
-"""
-@inline function Kmer{A,K}(itr) where {A,K}
-    T = kmertype(Kmer{A,K})
-    return T(itr)
-end
-
-"""
-    Kmer{A}(itr) where {A}
-
-Construct a `Kmer{A,K,N}` from an iterable.
-
-This is a convenience method which will work out K from the length of `itr`, and
-the correct `N` parameter, for your given choice of `A` & `K`.
+# Aliases
+"Shortcut for the type `Kmer{DNAAlphabet{2},K,N}`"
+const DNAKmer{K,N} = Kmer{DNAAlphabet{2},K,N}
 
-!!! warning
-    Since this gets K from runtime values, this is gonna be slow!
-"""
-@inline Kmer{A}(itr) where {A} = Kmer{A,length(itr)}(itr)
-@inline Kmer(seq::BioSequence{A}) where A = Kmer{A}(seq)
+"Shortcut for the type `DNAKmer{27,1}`"
+const DNA27mer = DNAKmer{27,1}
 
-function Kmer{A1}(seq::BioSequence{A2}) where {A1 <: NucleicAcidAlphabet, A2 <: NucleicAcidAlphabet}
-    kmertype(Kmer{A1, length(seq)})(seq)
-end
+"Shortcut for the type `DNAKmer{31,1}`"
+const DNA31mer = DNAKmer{31,1}
 
-@inline function Kmer{A}(nts::Vararg{Union{DNA, RNA}, K}) where {A <: NucleicAcidAlphabet, K}
-    return kmertype(Kmer{A, K})(nts)
-end
+"Shortcut for the type `DNAKmer{63,2}`"
+const DNA63mer = DNAKmer{63,2}
 
-"""
-    Kmer(nts::Vararg{DNA,K}) where {K}
+"Shortcut for the type `Kmer{RNAAlphabet{2},K,N}`"
+const RNAKmer{K,N} = Kmer{RNAAlphabet{2},K,N}
 
-Construct a Kmer from a variable number `K` of DNA nucleotides.
+"Shortcut for the type `RNAKmer{27,1}`"
+const RNA27mer = RNAKmer{27,1}
 
-# Examples
+"Shortcut for the type `RNAKmer{31,1}`"
+const RNA31mer = RNAKmer{31,1}
 
-```jldoctest
-julia> Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C)
-DNA 5-mer:
-TTAGC
-```
-"""
-@inline Kmer(nt::DNA, nts::Vararg{DNA}) = DNAKmer((nt, nts...))
+"Shortcut for the type `RNAKmer{63,2}`"
+const RNA63mer = RNAKmer{63,2}
 
-"""
-    Kmer(nts::Vararg{RNA,K}) where {K}
+"Shortcut for the type `Kmer{AminoAcidAlphabet,K,N}`"
+const AAKmer{K,N} = Kmer{AminoAcidAlphabet,K,N}
 
-Construct a Kmer from a variable number `K` of RNA nucleotides.
+"Shorthand for `DNAKmer{3,1}`"
+const DNACodon = DNAKmer{3,1}
 
-# Examples
+"Shorthand for `RNAKmer{3,1}`"
+const RNACodon = RNAKmer{3,1}
 
-```jldoctest
-julia> Kmer(RNA_U, RNA_U, RNA_A, RNA_G, RNA_C)
-DNA 5-mer:
-UUAGC
-```
-"""
-@inline Kmer(nt::RNA, nts::Vararg{RNA}) = RNAKmer((nt, nts...))
 
 
-"""
-    Kmer(seq::String)
 
-Construct a DNA or RNA kmer from a string.
 
-!!! warning
-    As a convenience method, this derives the `K`, `Alphabet`, and `N` parameters
-    for the `Kmer{A,K,N}` type from the input string.
+include("constructors.jl")
 
-# Examples
 
-```jldoctest
-julia> Kmer("TTAGC")
-DNA 5-mer:
-TTAGC
-```
-"""
-@inline function Kmer(seq::String)
-    seq′ = BioSequences.remove_newlines(seq)
-    hast = false
-    hasu = false
-    for c in seq′
-        hast |= ((c == 'T') | (c == 't'))
-        hasu |= ((c == 'U') | (c == 'u'))
-    end
-    if (hast & hasu) | (!hast & !hasu)
-        throw(ArgumentError("Can't detect alphabet type from string"))
-    end
-    A = ifelse(hast & !hasu, DNAAlphabet{2}, RNAAlphabet{2})
-    return Kmer{A,length(seq′)}(seq′)
-end
 
 
 """
@@ -359,41 +117,6 @@ Kmer{DNAAlphabet{2},63,2}
 end
 @inline kmertype(::Type{Kmer{A,K,N}}) where {A,K,N} = Kmer{A,K,N}
 
-# Aliases
-"Shortcut for the type `Kmer{DNAAlphabet{2},K,N}`"
-const DNAKmer{K,N} = Kmer{DNAAlphabet{2},K,N}
-
-"Shortcut for the type `DNAKmer{27,1}`"
-const DNA27mer = DNAKmer{27,1}
-
-"Shortcut for the type `DNAKmer{31,1}`"
-const DNA31mer = DNAKmer{31,1}
-
-"Shortcut for the type `DNAKmer{63,2}`"
-const DNA63mer = DNAKmer{63,2}
-
-"Shortcut for the type `Kmer{RNAAlphabet{2},K,N}`"
-const RNAKmer{K,N} = Kmer{RNAAlphabet{2},K,N}
-
-"Shortcut for the type `RNAKmer{27,1}`"
-const RNA27mer = RNAKmer{27,1}
-
-"Shortcut for the type `RNAKmer{31,1}`"
-const RNA31mer = RNAKmer{31,1}
-
-"Shortcut for the type `RNAKmer{63,2}`"
-const RNA63mer = RNAKmer{63,2}
-
-"Shortcut for the type `Kmer{AminoAcidAlphabet,K,N}`"
-const AAKmer{K,N} = Kmer{AminoAcidAlphabet,K,N}
-
-"Shorthand for `DNAKmer{3,1}`"
-const DNACodon = DNAKmer{3,1}
-
-"Shorthand for `RNAKmer{3,1}`"
-const RNACodon = RNAKmer{3,1}
-
-
 ###
 ### Base Functions
 ###
diff --git a/src/revtrans.jl b/src/revtrans.jl
index 4e8dc2e..fdf7db9 100644
--- a/src/revtrans.jl
+++ b/src/revtrans.jl
@@ -1,4 +1,3 @@
-struct Unsafe end
 
 const N_AA = length(AminoAcidAlphabet())
 
diff --git a/src/tuple_bitflipping.jl b/src/tuple_bitflipping.jl
index 4476956..a851467 100644
--- a/src/tuple_bitflipping.jl
+++ b/src/tuple_bitflipping.jl
@@ -25,6 +25,25 @@ Notably it's used when constructing a Kmer from an existing NTuple of UInt64
     return (head & (typemax(UInt64) >> by), tail...)
 end
 
+
+"""
+    _drophead(x::NTuple{N,UInt64}, drop::Val{D}) where {N,D}
+
+A function used to drop the first N elements from an NTuple.
+
+Intended for internal bit-flipping use.
+
+Notably used in cases where expanding a 2-bit encoding to
+a 4-bit encoding results in an unused element at the head
+of the tuple. 
+"""
+@inline function _drophead(x::NTuple{N,UInt64}, drop::Val{D}) where {N,D}
+    ntuple(Val{N - D}()) do i
+        Base.@_inline_meta
+        return @inbounds x[D + i]
+    end
+end
+
 #=
 rightshift_carry & leftshift_carry
 
@@ -69,4 +88,57 @@ end
 end
 
 @inline _reverse(f::F, ::BioSequences.BitsPerSymbol{N}) where {N,F<:Function} = ()
-=#
\ No newline at end of file
+=#
+
+const transcode_2_to_4_nucs = (function ()
+
+    @info "Computing 2 -> 4 bit translation table..."
+
+    fourbitnucs = (UInt32(1), UInt32(2), UInt32(4), UInt32(8))
+
+    fn(bits, lshift) = fourbitnucs[((bits >> lshift) & 0x03) + 1] << (2 * lshift) 
+
+    function translate2to4bits(bits::UInt16)
+        fn(bits, 0)  |
+        fn(bits, 2)  |
+        fn(bits, 4)  |
+        fn(bits, 6)  |
+        fn(bits, 8)  |
+        fn(bits, 10) |
+        fn(bits, 12) |
+        fn(bits, 14) 
+    end
+
+    ntuple((x) -> translate2to4bits(UInt16(x - 1)), 65536)
+end)()
+
+@inline function transcode_kernel(bits::UInt64, from::A, to::B) where
+    {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4}}
+
+    @inbounds begin
+        b = UInt64(transcode_2_to_4_nucs[((bits >>  0) & 0xFFFF) + 1]) <<  0 |
+            UInt64(transcode_2_to_4_nucs[((bits >> 16) & 0xFFFF) + 1]) << 32
+
+        a = UInt64(transcode_2_to_4_nucs[((bits >> 32) & 0xFFFF) + 1]) <<  0 |
+            UInt64(transcode_2_to_4_nucs[((bits >> 48) & 0xFFFF) + 1]) << 32
+    end
+    return (a, b)
+end
+
+@inline function transcode_bits(x::NTuple{N,UInt64}, from::A, to::B) where
+    {N,A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4}}
+
+    return _transcode_bits(from, to, x...)
+end
+
+@inline function _transcode_bits(from::A, to::B, head::UInt64, tail...) where
+    {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4}}
+
+    return (transcode_kernel(head, from, to)..., _transcode_bits(from, to, tail...)...)
+end
+
+@inline function _transcode_bits(from::A, to::B, head::UInt64) where
+    {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4}}
+
+    return transcode_kernel(head, from, to)
+end

From dc1912da46ac5710e1aa74d83632f8dbecba9afe Mon Sep 17 00:00:00 2001
From: "Sabrina J. Ward" <sabrinajward@protonmail.com>
Date: Sun, 9 Apr 2023 01:53:05 +0100
Subject: [PATCH 2/2] I dunno some stuff I guess

---
 src/constructors.jl | 117 +++++++++++++++++++++++++++++++++-----------
 src/kmer.jl         |   3 --
 test/find.jl        |   8 +--
 3 files changed, 92 insertions(+), 36 deletions(-)

diff --git a/src/constructors.jl b/src/constructors.jl
index 4b6ea56..561fff7 100644
--- a/src/constructors.jl
+++ b/src/constructors.jl
@@ -54,56 +54,85 @@ decoding can be skipped, and the problem is mostly one of shunting bits around.
     return (head, tail...)
 end
 
-"""
-    Kmer{A,K,N}(itr) where {A,K,N}
+function _construct_from_iterable(::Type{Kmer{A,K,N}}, iter, isize::Base.SizeUnknown) where {A,K,N}
+    ## All based on alphabet type of Kmer, so should constant fold.
+    checkmer(Kmer{A,K,N})
+    ET = eltype(Kmer{A,K,N})
+    bits_per_sym = BioSequences.bits_per_symbol(A())
+    n_head = elements_in_head(Kmer{A,K,N})
+    n_per_chunk = per_word_capacity(Kmer{A,K,N})
 
-Construct a `Kmer{A,K,N}` from an iterable.
+    stateful = Iterators.Stateful(iter)
+    head_filled = 0
 
-The most generic constructor.
+    # Construct the head.
+    head = zero(UInt64)
+    next = Ref{Union{Tuple{ET,Nothing},Nothing}}(iterate(stateful))
+    while next[] !== nothing && head_filled < n_head
+        el, _ = next[]
+        sym = convert(ET, el)
+        # Encode will throw if it cant encode an element.
+        head = (head << bits_per_sym) | UInt64(BioSequences.encode(A(), sym))
+        head_filled += 1
+        next[] = iterate(stateful)
+    end
 
-Currently the iterable must have `length` & support `getindex` with integers.
+    if head_filled != n_head
+        error("Iterator did not yield enough elements to construct $K-mer")
+    end
 
-# Examples
+    tail = ntuple(Val{N - 1}()) do i
+        Base.@_inline_meta
+        chunk_filled = 0
+        body = zero(UInt64)
+        while next[] !== nothing && chunk_filled < n_per_chunk
+            el, _ = next[]
+            sym = convert(ET, el)
+            # Encode will throw  if it cant encode an element.
+            body = (body << bits_per_sym) | UInt64(BioSequences.encode(A(), sym))
+            chunk_filled += 1
+            next[] = iterate(stateful)
+        end
 
-```jldoctest
-julia> ntseq = LongSequence("TTAGC") # 4-bit DNA alphabet
-5nt DNA Sequence:
-TTAGC
+        if chunk_filled != n_per_chunk
+            error("Iterator did not yield enough elements to construct $K-mer")
+        end
 
-julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet
-DNA 5-mer:
-TTAGC
-```
-"""
-function Kmer{A,K,N}(itr) where {A,K,N}
-    checkmer(Kmer{A,K,N})
-    
-    seqlen = length(itr)
-    if seqlen != K
-        throw(ArgumentError("itr does not contain enough elements ($seqlen ≠ $K)"))
+        return body
     end
     
+    data = (head, tail...)
+    
+    return Kmer{A,K,N}(data)
+end
+
+function _construct_from_iterable(::Type{Kmer{A,K,N}}, iter, isize::Base.HasLength) where {A,K,N}
     ## All based on alphabet type of Kmer, so should constant fold.
+    checkmer(Kmer{A,K,N})
+    ET = eltype(Kmer{A,K,N})
     bits_per_sym = BioSequences.bits_per_symbol(A())
     n_head = elements_in_head(Kmer{A,K,N})
     n_per_chunk = per_word_capacity(Kmer{A,K,N})
+
+    ilen = length(iter)
+    if ilen != K
+        throw(ArgumentError("seq is not the correct length ($ilen ≠ $K)"))
+    end
     
     # Construct the head.
     head = zero(UInt64)
-    @inbounds for i in 1:n_head
-        sym = convert(eltype(Kmer{A,K,N}), itr[i])
-        # Encode will throw if it cant encode an element.
+    @inbounds for i in firstindex(iter):(firstindex(iter) + n_head - 1)
+        sym = convert(ET, iter[i])
         head = (head << bits_per_sym) | UInt64(BioSequences.encode(A(), sym))
     end
     
     # And the rest of the sequence
-    idx = Ref(n_head + 1)
-    tail = ntuple(Val{N - 1}()) do i
+    idx = Ref(firstindex(iter) + n_head)
+    tail = ntuple(Val{N - 1}()) do _
         Base.@_inline_meta
         body = zero(UInt64)
-        @inbounds for i in 1:n_per_chunk
-            sym = convert(eltype(Kmer{A,K,N}), itr[idx[]])
-            # Encode will throw  if it cant encode an element.
+        @inbounds for _ in 1:n_per_chunk
+            sym = convert(ET, iter[i])
             body = (body << bits_per_sym) | UInt64(BioSequences.encode(A(), sym))
             idx[] += 1
         end
@@ -115,6 +144,36 @@ function Kmer{A,K,N}(itr) where {A,K,N}
     return Kmer{A,K,N}(data)
 end
 
+
+"""
+    Kmer{A,K,N}(itr) where {A,K,N}
+
+Construct a `Kmer{A,K,N}` from an iterable.
+
+The most generic constructor of a Kmer from any iterable type
+that implements the bare minimum Iterator interface.
+
+# Examples
+
+```jldoctest
+julia> ntseq = LongSequence("TTAGC") # 4-bit DNA alphabet
+5nt DNA Sequence:
+TTAGC
+
+julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet
+DNA 5-mer:
+TTAGC
+```
+"""
+function Kmer{A,K,N}(itr) where {A,K,N}
+    return _construct_from_iterable(Kmer{A,K,N}, itr, Base.IteratorSize(typeof(itr)))
+end
+
+
+
+
+
+
 """
     Kmer{A,K,N}(seq::BioSequence{A})
 
diff --git a/src/kmer.jl b/src/kmer.jl
index a66711a..ddc09ba 100644
--- a/src/kmer.jl
+++ b/src/kmer.jl
@@ -146,9 +146,6 @@ checked at compile time, and the branches / error throws eliminated when the
 parameterisation of the Kmer type is good. 
 """
 @inline function checkmer(::Type{Kmer{A,K,N}}) where {A,K,N}
-    if K < 1
-        throw(ArgumentError("Bad kmer parameterisation. K must be greater than 0."))
-    end
     n = BioSequences.seq_data_len(A, K)
     if n !== N
         # This has been significantly changed conceptually from before. Now we
diff --git a/test/find.jl b/test/find.jl
index c3d4aaa..f258ab0 100644
--- a/test/find.jl
+++ b/test/find.jl
@@ -4,7 +4,7 @@
     @test findnext(DNA_A, kmer, 1) == 1
     @test findnext(DNA_C, kmer, 1) == 2
     @test findnext(DNA_G, kmer, 1) == 3
-    @test findnext(DNA_T, kmer, 1) == nothing
+    @test findnext(DNA_T, kmer, 1) === nothing
     @test findnext(DNA_A, kmer, 2) == 4
 
     @test_throws BoundsError findnext(DNA_A, kmer, 0)
@@ -13,7 +13,7 @@
     @test findprev(DNA_A, kmer, 5) == 4
     @test findprev(DNA_C, kmer, 5) == 2
     @test findprev(DNA_G, kmer, 5) == 5
-    @test findprev(DNA_T, kmer, 5) == nothing
+    @test findprev(DNA_T, kmer, 5) === nothing
     @test findprev(DNA_G, kmer, 4) == 3
 
     @test findprev(DNA_A, kmer, 0) === nothing
@@ -37,8 +37,8 @@
     @test findnext(AA_T, kmer, 1) == 9
     
     @test findnext(AA_F, kmer, 4) == 5
-    @test findprev(AA_F, kmer, 4) == nothing
-    @test findnext(AA_A, kmer, 7) == nothing
+    @test findprev(AA_F, kmer, 4) === nothing
+    @test findnext(AA_A, kmer, 7) === nothing
     @test findnext(AA_M, kmer, 5) == 8
     
     @test findfirst(AA_M, kmer) == 2