From d289b3083e9d9878317e2b1fb190bab2252a5583 Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Fri, 12 Jul 2024 22:35:23 +0200 Subject: [PATCH] Stop special casing `InlineString1` I was looking into fixing https://github.com/JuliaStrings/InlineStrings.jl/issues/15 but realized that the special casing of `InlineString1` to only have one byte makes that not work. I would say that the current special casing of `InlineString1` creates quite a bit of confusing behavior: ``` julia> InlineString("") |> typeof String3 julia> InlineString("a") |> typeof String1 ``` Why would an empty string take more place than a one letter string? ``` julia> String1("") ERROR: ArgumentError: string too large (0) to convert to String1 Stacktrace: [1] stringtoolong(T::Type, n::Int64) @ InlineStrings ~/.julia/packages/InlineStrings/xUsry/src/InlineStrings.jl:321 [2] String1(x::String) @ InlineStrings ~/.julia/packages/InlineStrings/xUsry/src/InlineStrings.jl:208 [3] top-level scope @ REPL[4]:1 julia> String3("") "" ``` Wut? There is nothing in the documentation that indicates this type of special behavior. I'm sure there is some reason for doing this since so much pain seems to have been gone through to do it but I thought I would put up this PR nonetheless. Fixes https://github.com/JuliaStrings/InlineStrings.jl/issues/73 Fixes https://github.com/JuliaStrings/InlineStrings.jl/issues/72 --- ext/ParsersExt.jl | 13 +----- src/InlineStrings.jl | 104 ++++++++----------------------------------- test/runtests.jl | 27 ++++++----- 3 files changed, 35 insertions(+), 109 deletions(-) diff --git a/ext/ParsersExt.jl b/ext/ParsersExt.jl index 99be500..99885f0 100644 --- a/ext/ParsersExt.jl +++ b/ext/ParsersExt.jl @@ -1,6 +1,6 @@ module ParsersExt using Parsers -using InlineStrings: InlineString, InlineString1, addcodeunit +using InlineStrings: InlineString, addcodeunit Parsers.xparse(::Type{T}, buf::AbstractString, pos, len, options, ::Type{S}=T) where {T <: InlineString, S} = Parsers.xparse(T, codeunits(buf), pos, len, options, S) @@ -14,16 +14,7 @@ function Parsers.xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos x = T() else poslen = res.val - if T === InlineString1 - if poslen.len != 1 - overflowed = true - x = T() - else - Parsers.fastseek!(source, poslen.pos) - x = InlineString1(Parsers.peekbyte(source, poslen.pos)) - Parsers.fastseek!(source, pos + res.tlen - 1) - end - elseif Parsers.escapedstring(code) || !(source isa AbstractVector{UInt8}) + if Parsers.escapedstring(code) || !(source isa AbstractVector{UInt8}) if poslen.len > (sizeof(T) - 1) overflowed = true x = T() diff --git a/src/InlineStrings.jl b/src/InlineStrings.jl index 7fda5e1..c3ce32f 100644 --- a/src/InlineStrings.jl +++ b/src/InlineStrings.jl @@ -19,10 +19,10 @@ See more details by looking at individual docs for [`String1`](@ref), """ abstract type InlineString <: AbstractString end -for sz in (1, 4, 8, 16, 32, 64, 128, 256) - nm = Symbol(:String, max(1, sz - 1)) - nma = Symbol(:InlineString, max(1, sz - 1)) - macro_nm = Symbol(:inline, max(1, sz - 1), :_str) +for sz in (2, 4, 8, 16, 32, 64, 128, 256) + nm = Symbol(:String, sz-1) + nma = Symbol(:InlineString, sz-1) + macro_nm = Symbol(:inline, sz-1, :_str) at_macro_nm = Symbol("@", macro_nm) @eval begin """ @@ -30,9 +30,9 @@ for sz in (1, 4, 8, 16, 32, 64, 128, 256) $($nm)(bytes::AbstractVector{UInt8}, pos, len) $($nm)(ptr::Ptr{UInt8}, [len]) - Custom fixed-size string with a fixed size of $($sz) bytes. - 1 byte is used to store the length of the string. If an - inline string is shorter than $($(max(1, sz - 1))) bytes, the entire + Custom C compatible fixed-size string with a fixed size of $($sz) bytes. + 1 byte is used to store the capacity minus the length of the string. If an + inline string is shorter than $($(sz-1)) bytes, the entire string still occupies the full $($sz) bytes since they are, by definition, fixed size. Otherwise, they can be treated just like normal `String` values. Note that `sizeof(x)` will @@ -58,12 +58,12 @@ for sz in (1, 4, 8, 16, 32, 64, 128, 256) export $nma """ - inline$($(max(1, sz - 1)))"string" + inline$($(sz-1))"string" Macro to create a [`$($nm)`](@ref) with a fixed size of $($sz) bytes. """ macro $macro_nm(ex) - T = InlineStringType($(max(1,sz - 1))) + T = InlineStringType($(sz - 1)) T(unescape_string(ex)) end @@ -75,7 +75,7 @@ const SmallInlineStrings = Union{String1, String3, String7, String15} # used to zero out n lower bytes of an inline string clear_n_bytes(s, n) = Base.shl_int(Base.lshr_int(s, 8 * n), 8 * n) -_bswap(x::T) where {T <: InlineString} = T === InlineString1 ? x : Base.bswap_int(x) +_bswap(x::T) where {T <: InlineString} = Base.bswap_int(x) const InlineStringTypes = Union{InlineString1, InlineString3, @@ -115,26 +115,17 @@ Base.widen(::Type{InlineString63}) = InlineString127 Base.widen(::Type{InlineString127}) = InlineString255 Base.widen(::Type{InlineString255}) = String -Base.ncodeunits(::InlineString1) = 1 Base.ncodeunits(x::InlineString) = Int(Base.trunc_int(UInt8, x)) Base.codeunit(::InlineString) = UInt8 Base.@propagate_inbounds function Base.codeunit(x::T, i::Int) where {T <: InlineString} @boundscheck checkbounds(Bool, x, i) || throw(BoundsError(x, i)) - if T === InlineString1 - return Base.bitcast(UInt8, x) - else - return Base.trunc_int(UInt8, Base.lshr_int(x, 8 * (sizeof(T) - i))) - end + return Base.trunc_int(UInt8, Base.lshr_int(x, 8 * (sizeof(T) - i))) end function Base.String(x::T) where {T <: InlineString} len = ncodeunits(x) out = Base._string_n(len) - if T === InlineString1 - GC.@preserve out unsafe_store!(pointer(out), codeunit(x, 1)) - return out - end ref = Ref{T}(_bswap(x)) GC.@preserve ref out begin ptr = convert(Ptr{UInt8}, Base.unsafe_convert(Ptr{T}, ref)) @@ -149,9 +140,6 @@ function Base.Symbol(x::T) where {T <: InlineString} (Ref{T}, Int), ref, sizeof(x)) end -Base.cconvert(::Type{Ptr{UInt8}}, x::InlineString1) = Base.cconvert(Ptr{UInt8}, InlineString3(x)) -Base.cconvert(::Type{Ptr{Int8}}, x::InlineString1) = Base.cconvert(Ptr{Int8}, InlineString3(x)) -Base.cconvert(::Type{Cstring}, x::InlineString1) = Base.cconvert(Cstring, InlineString3(x)) Base.cconvert(::Type{Ptr{UInt8}}, x::T) where {T <: InlineString} = Ref{T}(_bswap(clear_n_bytes(x, 1))) Base.cconvert(::Type{Ptr{Int8}}, x::T) where {T <: InlineString} = @@ -189,9 +177,6 @@ end # add a codeunit to end of string method function addcodeunit(x::T, b::UInt8) where {T <: InlineString} - if T === InlineString1 - return Base.bitcast(InlineString1, b), false - end len = Base.trunc_int(UInt8, x) sz = Base.trunc_int(UInt8, sizeof(T)) shf = Base.zext_int(Int16, max(0x01, sz - len - 0x01)) << 3 @@ -199,37 +184,7 @@ function addcodeunit(x::T, b::UInt8) where {T <: InlineString} return Base.add_int(x, Base.zext_int(T, 0x01)), (len + 0x01) >= sz end -# from String -InlineString1(byte::UInt8=0x00) = Base.bitcast(InlineString1, byte) - -function InlineString1(x::AbstractString) - sizeof(x) == 1 || stringtoolong(InlineString1, sizeof(x)) - return Base.bitcast(InlineString1, codeunit(x, 1)) -end - -function InlineString1(buf::AbstractVector{UInt8}, pos=1, len=length(buf)) - len == 1 || stringtoolong(InlineString1, len) - return Base.bitcast(InlineString1, buf[pos]) -end - -function InlineString1(ptr::Ptr{UInt8}, len=nothing) - ptr == Ptr{UInt8}(0) && nullptr(InlineString1) - if len === nothing - y, _ = addcodeunit(InlineString1(), unsafe_load(ptr)) - unsafe_load(ptr, 2) === 0x00 || stringtoolong(InlineString1, 2) - return y - else - len == 1 || stringtoolong(InlineString1, len) - return Base.bitcast(InlineString1, unsafe_load(ptr)) - end -end - -function InlineString1(x::S) where {S <: InlineString} - sizeof(x) == 1 || stringtoolong(InlineString1, sizeof(x)) - return Base.bitcast(InlineString1, codeunit(x, 1)) -end - -for T in (:InlineString3, :InlineString7, :InlineString15, :InlineString31, :InlineString63, :InlineString127, :InlineString255) +for T in (:InlineString1, :InlineString3, :InlineString7, :InlineString15, :InlineString31, :InlineString63, :InlineString127, :InlineString255) @eval $T() = Base.zext_int($T, 0x00) @eval function $T(x::AbstractString) @@ -304,11 +259,7 @@ for T in (:InlineString3, :InlineString7, :InlineString15, :InlineString31, :Inl return Base.add_int(y, Base.zext_int($T, UInt8(len))) else # promoting smaller InlineString to larger - if S === InlineString1 - y = Base.shl_int(Base.zext_int($T, x), 8 * (sizeof($T) - sizeof(S))) - else - y = Base.shl_int(Base.zext_int($T, Base.lshr_int(x, 8)), 8 * (sizeof($T) - sizeof(S) + 1)) - end + y = Base.shl_int(Base.zext_int($T, Base.lshr_int(x, 8)), 8 * (sizeof($T) - sizeof(S) + 1)) return Base.add_int(y, Base.zext_int($T, UInt8(sizeof(x)))) end end @@ -320,7 +271,7 @@ end function InlineStringType(n::Integer) n > 255 && stringtoolong(InlineString, n) - return n == 1 ? InlineString1 : n < 4 ? InlineString3 : + return n < 2 ? InlineString1 : n < 4 ? InlineString3 : n < 8 ? InlineString7 : n < 16 ? InlineString15 : n < 32 ? InlineString31 : n < 64 ? InlineString63 : n < 128 ? InlineString127 : InlineString255 @@ -381,7 +332,6 @@ function Base.read(s::IO, ::Type{T}) where {T <: InlineString} end function Base.print(io::IO, x::T) where {T <: InlineString} - x isa InlineString1 && return print(io, Char(Base.bitcast(UInt8, x))) ref = Ref{T}(_bswap(x)) return GC.@preserve ref begin ptr = convert(Ptr{UInt8}, Base.unsafe_convert(Ptr{T}, ref)) @@ -391,9 +341,6 @@ function Base.print(io::IO, x::T) where {T <: InlineString} end function Base.isascii(x::T) where {T <: InlineString} - if T === InlineString1 - return codeunit(x, 1) < 0x80 - end len = ncodeunits(x) x = Base.lshr_int(x, 8 * (sizeof(T) - len)) for _ = 1:(len >> 2) @@ -410,7 +357,6 @@ end # "mutating" operations; care must be taken here to "clear out" # unused bits to ensure our == definition continues to work # which compares the full bit contents of inline strings -Base.chop(s::InlineString1; kw...) = chop(String3(s); kw...) function Base.chop(s::InlineString; head::Integer = 0, tail::Integer = 1) if isempty(s) return s @@ -431,7 +377,6 @@ end Base.getindex(s::InlineString, r::AbstractUnitRange{<:Integer}) = getindex(s, Int(first(r)):Int(last(r))) -Base.getindex(s::InlineString1, r::UnitRange{Int}) = getindex(InlineString3(s), r) Base.@propagate_inbounds function Base.getindex(s::InlineString, r::UnitRange{Int}) isempty(r) && return typeof(s)("") i = first(r) @@ -448,7 +393,6 @@ Base.view(s::InlineString, r::AbstractUnitRange{<:Integer}) = getindex(s, r) if isdefined(Base, :chopprefix) -Base.chopprefix(s::InlineString1, prefix::AbstractString) = chopprefix(String3(s), prefix) function Base.chopprefix(s::InlineString, prefix::AbstractString) if !isempty(prefix) && startswith(s, prefix) return _chopprefix(s, prefix) @@ -456,7 +400,6 @@ function Base.chopprefix(s::InlineString, prefix::AbstractString) return s end -Base.chopprefix(s::InlineString1, prefix::Regex) = chopprefix(String3(s), prefix) function Base.chopprefix(s::InlineString, prefix::Regex) m = match(prefix, String(s), firstindex(s), Base.PCRE.ANCHORED) m === nothing && return s @@ -483,7 +426,6 @@ end throw_strip_argument_error() = throw(ArgumentError("Both arguments are strings. The second argument should be a `Char` or collection of `Char`s")) -Base.lstrip(f, s::InlineString1) = lstrip(f, InlineString3(s)) function Base.lstrip(f, s::InlineString) nc = 0 len = 0 @@ -499,11 +441,9 @@ function Base.lstrip(f, s::InlineString) end Base.lstrip(::AbstractString, ::InlineString) = throw_strip_argument_error() -Base.lstrip(::AbstractString, ::InlineString1) = throw_strip_argument_error() if isdefined(Base, :chopsuffix) -Base.chopsuffix(s::InlineString1, suffix::AbstractString) = chopsuffix(String3(s), suffix) function Base.chopsuffix(s::InlineString, suffix::AbstractString) if !isempty(suffix) && endswith(s, suffix) return _chopsuffix(s, suffix) @@ -511,7 +451,6 @@ function Base.chopsuffix(s::InlineString, suffix::AbstractString) return s end -Base.chopsuffix(s::InlineString1, suffix::Regex) = chopsuffix(String3(s), suffix) function Base.chopsuffix(s::InlineString, suffix::Regex) m = match(suffix, String(s), firstindex(s), Base.PCRE.ENDANCHORED) m === nothing && return s @@ -529,7 +468,6 @@ _chopsuffix(s::InlineString, suffix::AbstractString) = _chopsuffix(s, ncodeunits return Base.or_int(s, _oftype(typeof(s), new_n)) end -Base.rstrip(f, s::InlineString1) = rstrip(f, InlineString3(s)) function Base.rstrip(f, s::InlineString) nc = 0 for c in Iterators.reverse(s) @@ -543,9 +481,7 @@ function Base.rstrip(f, s::InlineString) end Base.rstrip(::AbstractString, ::InlineString) = throw_strip_argument_error() -Base.rstrip(::AbstractString, ::InlineString1) = throw_strip_argument_error() -Base.chomp(s::InlineString1) = chomp(String3(s)) function Base.chomp(s::InlineString) i = lastindex(s) len = ncodeunits(s) @@ -558,14 +494,12 @@ function Base.chomp(s::InlineString) end end -Base.first(s::InlineString1, n::Integer) = first(String3(s), n) function Base.first(s::T, n::Integer) where {T <: InlineString} newlen = nextind(s, min(lastindex(s), nextind(s, 0, n))) - 1 i = sizeof(T) - newlen return Base.or_int(clear_n_bytes(s, i), _oftype(typeof(s), newlen)) end -Base.last(s::InlineString1, n::Integer) = last(String3(s), n) function Base.last(s::T, n::Integer) where {T <: InlineString} nc = ncodeunits(s) + 1 i = max(1, prevind(s, nc, n)) @@ -643,13 +577,11 @@ Base.string(a::_SmallestInlineStrings, b::_SmallestInlineStrings, c::_SmallestIn # Only benefit from keeping the small-ish strings as InlineStrings function _string(a::Ta, b::Tb) where {Ta <: SmallInlineStrings, Tb <: SmallInlineStrings} T = summed_type(Ta, Tb) - lb_a = Int(!isa(a, InlineString1)) # no "length byte" to remove if InlineString1 - lb_b = Int(!isa(b, InlineString1)) len_a = sizeof(a) len_b = sizeof(b) # Remove length byte (lshr), grow to new size (zext), move chars forward (shl). - a2 = Base.shl_int(Base.zext_int(T, Base.lshr_int(a, 8*lb_a)), 8 * (sizeof(T) - sizeof(Ta) + lb_a)) - b2 = Base.shl_int(Base.zext_int(T, Base.lshr_int(b, 8*lb_b)), 8 * (sizeof(T) - sizeof(Tb) + lb_b - len_a)) + a2 = Base.shl_int(Base.zext_int(T, Base.lshr_int(a, 8)), 8 * (sizeof(T) - sizeof(Ta) + 1)) + b2 = Base.shl_int(Base.zext_int(T, Base.lshr_int(b, 8)), 8 * (sizeof(T) - sizeof(Tb) + 1 - len_a)) lb = _oftype(T, len_a + len_b) # new length byte return Base.or_int(Base.or_int(a2, b2), lb) end @@ -879,7 +811,7 @@ using Base.Sort, Base.Order # And under certain thresholds, MergeSort is faster than RadixSort, even for small InlineStrings const MergeSortThresholds = Dict( - 1 => 2^5, + 2 => 2^5, 4 => 2^7, 8 => 2^9, 16 => 2^23 @@ -925,7 +857,7 @@ function Base.sort!(vs::AbstractVector, lo::Int, hi::Int, ::InlineStringSortAlg, # setup ts = similar(vs) - rdx = Radix(sizeof(T) == 1 ? 8 : 11) + rdx = Radix(11) radix_size = rdx.size radix_mask = rdx.mask radix_size_pow = rdx.pow diff --git a/test/runtests.jl b/test/runtests.jl index 5c11f97..e6418fd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -136,20 +136,20 @@ if isdefined(Base, :chopprefix) @test chopprefix(abc, "a") === InlineString3("bc") @test chopprefix(abc, "bc") === abc @test chopprefix(abc, "abc") === InlineString3("") -@test chopprefix(InlineString1("a"), "a") === InlineString3("") +@test chopprefix(InlineString1("a"), "a") === InlineString1("") # Regex case @test chopprefix(InlineString15("∃∃∃b∃"), r"∃+") === InlineString15("b∃") -@test chopprefix(InlineString1("a"), r".") === InlineString3("") +@test chopprefix(InlineString1("a"), r".") === InlineString1("") end if isdefined(Base, :chopsuffix) @test chopsuffix(abc, "a") === abc @test chopsuffix(abc, "bc") === InlineString3("a") @test chopsuffix(abc, "abc") === InlineString3("") -@test chopsuffix(InlineString1("c"), "c") === InlineString3("") +@test chopsuffix(InlineString1("c"), "c") === InlineString1("") # Regex case @test chopsuffix(InlineString15("∃b∃∃∃"), r"∃+") === InlineString15("∃b") -@test chopsuffix(InlineString1("c"), r".") === InlineString3("") +@test chopsuffix(InlineString1("c"), r".") === InlineString1("") end if isdefined(Base, :chopprefix) && isdefined(Base, :chopsuffix) @@ -209,7 +209,7 @@ S = InlineString7 @test rstrip(S(" a b c ")) === S(" a b c") @test rstrip(isnumeric, S("abc0123")) === S("abc") @test rstrip(S("ello"), ['e','o']) === S("ell") -@test rstrip(InlineString1("x")) === InlineString3("x") +@test rstrip(InlineString1("x")) === InlineString1("x") @test_throws ArgumentError rstrip("test", S(" a b c ")) @test_throws ArgumentError rstrip("test", InlineString1("x")) @@ -217,11 +217,11 @@ S = InlineString7 @test lstrip(S(" a b c ")) === S("a b c ") @test lstrip(isnumeric, S("0123abc")) === S("abc") @test lstrip(S("ello"), ['e','o']) === S("llo") -@test lstrip(InlineString1("x")) === InlineString3("x") +@test lstrip(InlineString1("x")) === InlineString1("x") @test_throws ArgumentError lstrip("test", S(" a b c ")) @test_throws ArgumentError lstrip("test", InlineString1("x")) -@test strip(InlineString1("x")) === InlineString3("x") +@test strip(InlineString1("x")) === InlineString1("x") S = InlineString3 @test strip(S("")) === S("") @test strip(S(" ")) === S("") @@ -238,9 +238,9 @@ for S in SUBTYPES if S == InlineString1 x = S("x") @test x[1] == 'x' - @test x[1:1] isa InlineString3 - @test x[1:1] === view(x, 1:1) === InlineString3(x) - @test x[2:1] === view(x, 2:1) === InlineString3("") + @test x[1:1] isa InlineString1 + @test x[1:1] === view(x, 1:1) === InlineString1(x) + @test x[2:1] === view(x, 2:1) === InlineString1("") @test_throws BoundsError x[2] else abc = S("abc") @@ -437,7 +437,7 @@ for (i, case) in enumerate(testcases) end res = Parsers.xparse(InlineString1, "") -@test Parsers.overflow(res.code) +@test !Parsers.overflow(res.code) res = Parsers.xparse(InlineString1, "ab") @test Parsers.overflow(res.code) res = Parsers.xparse(InlineString1, "b") @@ -482,7 +482,7 @@ end @testset "sorting tests" begin for nelems in (50, 100, 500, 1000, 5000, 100_000) for T in (String1, String3, String7, String15, String31, String63, String127, String255) - x = [randstring(rand(1:(max(1, sizeof(T) - 1)))) for _ = 1:nelems]; + x = [randstring(rand(0:(sizeof(T) - 1))) for _ = 1:nelems]; y = map(T, x); @test sort(x) == sort(y) @test sort(x; rev=true) == sort(y; rev=true) @@ -567,6 +567,9 @@ end @test repr(InlineString[inline1"a", inline15"a"]) == "InlineString[String1(\"a\"), String15(\"a\")]" end +@test inlinestrings(["a", "b", ""]) == [String1("a"), String1("b"), String1("")] +@test String1("") == "" + # only test package extension on >= 1.9.0 if VERSION >= v"1.9.0" && Sys.WORD_SIZE == 64 include(joinpath(dirname(pathof(InlineStrings)), "../ext/tests.jl"))