From 94d28f85d656521ea10cc19cb244d3086622193e Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Mon, 23 Dec 2024 21:15:15 +0100 Subject: [PATCH 1/2] Make unsafe_shift_from(::Copyable) use whole chunks --- src/construction.jl | 85 --------------------------------------- src/construction_utils.jl | 38 +++++++++++++++++ 2 files changed, 38 insertions(+), 85 deletions(-) diff --git a/src/construction.jl b/src/construction.jl index 032c3aa..af8834c 100644 --- a/src/construction.jl +++ b/src/construction.jl @@ -109,91 +109,6 @@ include("construction_utils.jl") throw(BioSequences.EncodeError(A, reinterpret(T, enc % UInt8))) end -#= -"Extract a full kmer at a given index of a sequence. -Note: These methods don't do any bounds checking" -function unsafe_extract end - -@inline function unsafe_extract( - ::TwoToFour, - ::Type{T}, - seq::BioSequence, - from_index, -) where {T <: Kmer} - data = zero_tuple(T) - for i in from_index:(from_index + ksize(T) - 1) - encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(seq, i))) - (_, data) = leftshift_carry(data, 4, encoding) - end - T(unsafe, data) -end - -@inline function unsafe_extract( - ::FourToTwo, - ::Type{T}, - seq::BioSequence, - from_index, -) where {T <: Kmer} - data = zero_tuple(T) - for i in from_index:(from_index + ksize(T) - 1) - encoding = UInt(BioSequences.extract_encoded_element(seq, i))::UInt - isone(count_ones(encoding)) || throw_uncertain(Alphabet(T), eltype(seq), encoding) - (_, data) = leftshift_carry(data, 2, trailing_zeros(encoding) % UInt) - end - T(unsafe, data) -end - -@inline function unsafe_extract( - ::Copyable, - ::Type{T}, - seq::BioSequence, - from_index, -) where {T <: Kmer} - data = zero_tuple(T) - bps = BioSequences.bits_per_symbol(Alphabet(seq)) - for i in from_index:(from_index + ksize(T) - 1) - encoding = UInt(BioSequences.extract_encoded_element(seq, i))::UInt - (_, data) = leftshift_carry(data, bps, encoding) - end - T(unsafe, data) -end - -@inline function unsafe_extract( - ::AsciiEncode, - ::Type{T}, - seq::AbstractVector{UInt8}, - from_index, -) where {T <: Kmer} - data = zero_tuple(T) - bps = BioSequences.bits_per_symbol(Alphabet(T)) - @inbounds for i in from_index:(from_index + ksize(T) - 1) - byte = seq[i] - encoding = BioSequences.ascii_encode(Alphabet(T), byte) - if encoding > 0x7f - throw(BioSequences.EncodeError(Alphabet(T), byte)) - end - (_, data) = leftshift_carry(data, bps, encoding % UInt) - end - T(unsafe, data) -end - -@inline function unsafe_extract( - ::GenericRecoding, - ::Type{T}, - seq, - from_index, -) where {T <: Kmer} - data = zero_tuple(T) - bps = BioSequences.bits_per_symbol(Alphabet(T)) - @inbounds for i in 1:ksize(T) - symbol = convert(eltype(T), seq[i]) - encoding = UInt(BioSequences.encode(Alphabet(T), symbol)) - (_, data) = leftshift_carry(data, bps, encoding) - end - T(unsafe, data) -end -=# - ################################################ # Constructors with full parameterisation ################################################ diff --git a/src/construction_utils.jl b/src/construction_utils.jl index 3b6463d..cb619fe 100644 --- a/src/construction_utils.jl +++ b/src/construction_utils.jl @@ -68,6 +68,16 @@ end T(unsafe, data) end +# For this method, we can copy multiple symbols at once. +@inline function unsafe_extract( + ::Copyable, + ::Type{T}, + seq::Union{LongSequence, LongSubSeq}, + from_index, +) where {T <: Kmer} + unsafe_shift_from(Copyable(), zero_kmer(T), seq, from_index, Val{ksize(T)}()) +end + @inline function unsafe_extract( ::AsciiEncode, ::Type{T}, @@ -186,6 +196,34 @@ end kmer end +@inline function unsafe_shift_from( + ::Copyable, + kmer::Kmer, + seq::Union{LongSequence, LongSubSeq}, + from::Int, + ::Val{S}, +) where {S} + bps = BioSequences.bits_per_symbol(seq) + remaining = S + i = Int(from)::Int + data = kmer.data + while !iszero(remaining) + bi = BioSequences.bitindex(seq, i) + off = BioSequences.offset(bi) + element = @inbounds seq.data[BioSequences.index(bi)] + element = BioSequences.reversebits(element, BioSequences.BitsPerSymbol(seq)) + n_used_bits = min(remaining * bps, 64 - off) & 63 + n_used_symbols = div(n_used_bits, bps) + shift = 64 - (n_used_bits + off) + element >>>= (shift & 63) + element &= (UInt(1) << (n_used_bits)) - 1 + (_, data) = leftshift_carry(data, n_used_bits, element) + remaining -= n_used_symbols + i += n_used_symbols + end + typeof(kmer)(unsafe, (first(data) & get_mask(typeof(kmer)), Base.tail(data)...)) +end + @inline function unsafe_shift_from( ::TwoToFour, kmer::Kmer{<:NucleicAcidAlphabet{4}}, From cbd290158e1bb1d0818b0622a9e9de326b8d61c1 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Thu, 26 Dec 2024 16:34:23 +0100 Subject: [PATCH 2/2] Scratch first attempt --- src/construction_utils.jl | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/src/construction_utils.jl b/src/construction_utils.jl index cb619fe..3b6463d 100644 --- a/src/construction_utils.jl +++ b/src/construction_utils.jl @@ -68,16 +68,6 @@ end T(unsafe, data) end -# For this method, we can copy multiple symbols at once. -@inline function unsafe_extract( - ::Copyable, - ::Type{T}, - seq::Union{LongSequence, LongSubSeq}, - from_index, -) where {T <: Kmer} - unsafe_shift_from(Copyable(), zero_kmer(T), seq, from_index, Val{ksize(T)}()) -end - @inline function unsafe_extract( ::AsciiEncode, ::Type{T}, @@ -196,34 +186,6 @@ end kmer end -@inline function unsafe_shift_from( - ::Copyable, - kmer::Kmer, - seq::Union{LongSequence, LongSubSeq}, - from::Int, - ::Val{S}, -) where {S} - bps = BioSequences.bits_per_symbol(seq) - remaining = S - i = Int(from)::Int - data = kmer.data - while !iszero(remaining) - bi = BioSequences.bitindex(seq, i) - off = BioSequences.offset(bi) - element = @inbounds seq.data[BioSequences.index(bi)] - element = BioSequences.reversebits(element, BioSequences.BitsPerSymbol(seq)) - n_used_bits = min(remaining * bps, 64 - off) & 63 - n_used_symbols = div(n_used_bits, bps) - shift = 64 - (n_used_bits + off) - element >>>= (shift & 63) - element &= (UInt(1) << (n_used_bits)) - 1 - (_, data) = leftshift_carry(data, n_used_bits, element) - remaining -= n_used_symbols - i += n_used_symbols - end - typeof(kmer)(unsafe, (first(data) & get_mask(typeof(kmer)), Base.tail(data)...)) -end - @inline function unsafe_shift_from( ::TwoToFour, kmer::Kmer{<:NucleicAcidAlphabet{4}},