From e1918ce3ed8fc8ac7fbc8f81dd9072e764f40644 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sun, 22 Dec 2024 11:35:31 +0100 Subject: [PATCH] Kmers.jl compatibility (#282) Add various small improvements to BioSequences to make it compatible with the upcoming Kmers.jl release. All of these changes are improved internals. * Improve showerror for EncodeError * Improve test in has_interface * Add generic BioSequence methods for bits_per_symbol, firstbitindex and lastbitindex * Add some more reversebits methods for different bits per symbol sizes --- Project.toml | 2 +- docs/src/construction.md | 2 +- src/alphabet.jl | 10 ++++++++- src/biosequence/biosequence.jl | 5 +++-- src/biosequence/indexing.jl | 3 +++ src/bit-manipulation/bit-manipulation.jl | 26 ++++++++++++++++++++---- src/longsequences/constructors.jl | 2 +- src/longsequences/indexing.jl | 3 --- test/biosequences/misc.jl | 17 ++++++++++++++++ 9 files changed, 57 insertions(+), 13 deletions(-) diff --git a/Project.toml b/Project.toml index ae562aeb..523d0a1d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "BioSequences" uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" authors = ["Sabrina Jaye Ward ", "Jakob Nissen "] -version = "3.4.0" +version = "3.4.1" [deps] BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" diff --git a/docs/src/construction.md b/docs/src/construction.md index 63f2f4ce..d3bae005 100644 --- a/docs/src/construction.md +++ b/docs/src/construction.md @@ -317,7 +317,7 @@ If the input cannot be encoded by any of the built-in alphabets, an error is thr ```jldoctest julia> bioseq("0!(CC!;#&&%") -ERROR: cannot encode 0x30 in AminoAcidAlphabet +ERROR: cannot encode 0x30 (Char '0') in AminoAcidAlphabet [...] ``` diff --git a/src/alphabet.jl b/src/alphabet.jl index bd6ac362..3175ac72 100644 --- a/src/alphabet.jl +++ b/src/alphabet.jl @@ -135,7 +135,15 @@ end EncodeError(::A, val::T) where {A,T} = EncodeError{A,T}(val) function Base.showerror(io::IO, err::EncodeError{A}) where {A} - print(io, "cannot encode ", repr(err.val), " in ", A) + val = err.val + char_repr = if val isa Integer && val < 0x80 + repr(val) * " (Char '" * Char(val) * "')" + elseif val isa Union{AbstractString, AbstractChar} + repr(val) + else + string(err.val) + end + print(io, "cannot encode " * char_repr * " in ", A) end """ diff --git a/src/biosequence/biosequence.jl b/src/biosequence/biosequence.jl index ba1fa375..bb75dc68 100644 --- a/src/biosequence/biosequence.jl +++ b/src/biosequence/biosequence.jl @@ -74,7 +74,7 @@ function has_interface( isempty(syms) && error("Vector syms must not be empty") first(syms) isa eltype(T) || error("Vector is of wrong element type") seq = T((i for i in syms)) - length(seq) > 0 || return false + length(seq) == length(syms) || return false eachindex(seq) === Base.OneTo(length(seq)) || return false E = encoded_data_eltype(T) e = extract_encoded_element(seq, 1) @@ -101,13 +101,14 @@ Base.nextind(::BioSequence, i::Integer) = Int(i) + 1 Base.prevind(::BioSequence, i::Integer) = Int(i) - 1 Base.size(x::BioSequence) = (length(x),) Base.eltype(::Type{<:BioSequence{A}}) where {A <: Alphabet} = eltype(A) -Base.eltype(x::BioSequence) = eltype(typeof(x)) Alphabet(::Type{<:BioSequence{A}}) where {A <: Alphabet} = A() Alphabet(x::BioSequence) = Alphabet(typeof(x)) Base.isempty(x::BioSequence) = iszero(length(x)) Base.empty(::Type{T}) where {T <: BioSequence} = T(eltype(T)[]) Base.empty(x::BioSequence) = empty(typeof(x)) BitsPerSymbol(x::BioSequence) = BitsPerSymbol(Alphabet(typeof(x))) +bits_per_symbol(::Type{T}) where {T <: BioSequence} = bits_per_symbol(Alphabet(T)) +bits_per_symbol(x::BioSequence) = bits_per_symbol(typeof(x)) Base.hash(s::BioSequence, x::UInt) = foldl((a, b) -> hash(b, a), s, init=x) function Base.similar(seq::BioSequence, len::Integer=length(seq)) diff --git a/src/biosequence/indexing.jl b/src/biosequence/indexing.jl index 4a87059c..c2634965 100644 --- a/src/biosequence/indexing.jl +++ b/src/biosequence/indexing.jl @@ -11,6 +11,9 @@ (i % UInt) - 1 < (lastindex(seq) % UInt) ? (@inbounds seq[i], i + 1) : nothing end +lastbitindex(x::BioSequence) = bitindex(x, lastindex(x)) +firstbitindex(x::BioSequence) = bitindex(x, firstindex(x)) + ## Bounds checking function Base.checkbounds(x::BioSequence, i::Integer) firstindex(x) ≤ i ≤ lastindex(x) || throw(BoundsError(x, i)) diff --git a/src/bit-manipulation/bit-manipulation.jl b/src/bit-manipulation/bit-manipulation.jl index ca3ae7d2..72114ae7 100644 --- a/src/bit-manipulation/bit-manipulation.jl +++ b/src/bit-manipulation/bit-manipulation.jl @@ -1,16 +1,34 @@ -@inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: Base.BitUnsigned +const BitUnsigned = Union{UInt8, UInt16, UInt32, UInt64, UInt128} + +@inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: BitUnsigned mask = 0x33333333333333333333333333333333 % T x = ((x >> 2) & mask) | ((x & mask) << 2) return reversebits(x, BitsPerSymbol{4}()) end -@inline function reversebits(x::T, ::BitsPerSymbol{4}) where T <: Base.BitUnsigned +@inline function reversebits(x::T, ::BitsPerSymbol{4}) where T <: BitUnsigned mask = 0x0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F % T x = ((x >> 4) & mask) | ((x & mask) << 4) - return bswap(x) + return reversebits(x, BitsPerSymbol{8}()) +end + +@inline reversebits(x::T, ::BitsPerSymbol{8}) where T <: BitUnsigned = bswap(x) + +@inline reversebits(x::UInt16, ::BitsPerSymbol{16}) = x +@inline function reversebits(x::T, ::BitsPerSymbol{16}) where T <: Union{UInt32, UInt64} + mask = 0x0000FFFF0000FFFF0000FFFF0000FFFF % T + x = ((x >> 16) & mask) | ((x & mask) << 16) + reversebits(x, BitsPerSymbol{32}()) +end + +@inline reversebits(x::UInt32, ::BitsPerSymbol{32}) = x +@inline function reversebits(x::T, ::BitsPerSymbol{32}) where T <: Union{UInt64} + mask = 0x00000000FFFFFFF00000000FFFFFFFF % T + x = ((x >> 32) & mask) | ((x & mask) << 32) + reversebits(x, BitsPerSymbol{64}()) end -reversebits(x::T, ::BitsPerSymbol{8}) where T <: Base.BitUnsigned = bswap(x) +@inline reversebits(x::UInt64, ::BitsPerSymbol{64}) = x @inline function complement_bitpar(x::Unsigned, ::T) where {T<:NucleicAcidAlphabet{2}} return ~x diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl index 2054ce6f..2a2ec66a 100644 --- a/src/longsequences/constructors.jl +++ b/src/longsequences/constructors.jl @@ -131,7 +131,7 @@ julia> bioseq("UAUGCUGUAGG") UAUGCUGUAGG julia> bioseq("PKMW#3>>0;kL") -ERROR: cannot encode 0x23 in AminoAcidAlphabet +ERROR: cannot encode 0x23 (Char '#') in AminoAcidAlphabet [...] ``` """ diff --git a/src/longsequences/indexing.jl b/src/longsequences/indexing.jl index 183c559c..94ceffad 100644 --- a/src/longsequences/indexing.jl +++ b/src/longsequences/indexing.jl @@ -10,9 +10,6 @@ bitindex(N, encoded_data_eltype(typeof(x)), i) end -firstbitindex(s::SeqOrView) = bitindex(s, firstindex(s) % UInt) -lastbitindex(s::SeqOrView) = bitindex(s, lastindex(s) % UInt) - @inline function extract_encoded_element(x::SeqOrView, i::Integer) bi = bitindex(x, i % UInt) extract_encoded_element(bi, x.data) diff --git a/test/biosequences/misc.jl b/test/biosequences/misc.jl index 38ba5684..3f7ab1bc 100644 --- a/test/biosequences/misc.jl +++ b/test/biosequences/misc.jl @@ -193,4 +193,21 @@ end @test ungap(seq) == seq cp = copy(seq) @test ungap!(seq) == cp +end + +# Ideally, we'd not test this internal function, but instead the code that +# relies on this, but this is only called if you have custom alphabets, and +# creating these alphabets of different sizes is a hassle +@testset "bitreverse" begin + bps8 = BioSequences.BitsPerSymbol{8}() + bps16 = BioSequences.BitsPerSymbol{16}() + bps32 = BioSequences.BitsPerSymbol{32}() + bps64 = BioSequences.BitsPerSymbol{64}() + reversebits = BioSequences.reversebits + @test reversebits(0x0102, bps16) === 0x0102 + @test reversebits(0x01020304, bps16) === 0x03040102 + @test reversebits(0x0102030405060708, bps16) === 0x0708050603040102 + @test reversebits(0x01020304, bps32) === 0x01020304 + @test reversebits(0x0102030405060708, bps32) === 0x0506070801020304 + @test reversebits(0x0102030405060708, bps64) === 0x0102030405060708 end \ No newline at end of file