Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arrow support #167

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,23 @@ authors = ["Invenia Technical Computing"]
version = "1.5.0"

[deps]
ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53"

[compat]
ArrowTypes = "1.2"
Documenter = "0.23, 0.24"
Infinity = "0.2.3"
RecipesBase = "0.7, 0.8, 1"
TimeZones = "0.7, 0.8, 0.9, 0.10, 0.11, 1"
julia = "1"

[extras]
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
Infinity = "a303e19e-6eb4-11e9-3b09-cd9505f79100"
Expand All @@ -27,4 +30,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
VisualRegressionTests = "34922c18-7c2a-561c-bac1-01e79b2c4c92"

[targets]
test = ["Documenter", "ImageMagick", "Infinity", "Plots", "Test", "VisualRegressionTests"]
test = ["Arrow", "Documenter", "ImageMagick", "Infinity", "Plots", "Test", "VisualRegressionTests"]
2 changes: 2 additions & 0 deletions src/Intervals.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
module Intervals

using ArrowTypes: ArrowTypes, JuliaType, arrowname
using Dates
using Printf
using RecipesBase
Expand Down Expand Up @@ -34,6 +35,7 @@ include("plotting.jl")
include("docstrings.jl")
include("deprecated.jl")
include("compat.jl")
include("arrow.jl")

export Bound,
Closed,
Expand Down
79 changes: 79 additions & 0 deletions src/arrow.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Register every bound type with ArrowTypes under a fixed name so that a bound can be
# written as a plain string/symbol and resolved back to the Julia type when reading.
for T in (Closed, Open, Unbounded)
    # Build the name from `nameof(T)` instead of `string(T)`. Printing a type prepends its
    # module (e.g. "Intervals.Closed") whenever the type is not visible from the active
    # module, so `string(T)` would make the serialized name depend on how and where this
    # package happens to be loaded. `nameof` always yields the bare type name.
    name = QuoteNode(Symbol("JuliaLang.Intervals.$(nameof(T))"))

    @eval begin
        ArrowTypes.arrowname(::Type{$T}) = $name
        ArrowTypes.JuliaType(::Val{$name}) = $T
    end
end

# Compact Arrow serialization for vectors whose element type is a fully parameterized
# `Interval{T,L,R}`: the bound types are identical for every element, so they are written
# once as metadata instead of alongside each endpoint.
let name = Symbol("JuliaLang.Intervals.Interval{T,L,R}")
    ArrowTypes.arrowname(::Type{Interval{T,L,R}}) where {T,L<:Bound,R<:Bound} = name

    # The Arrow type is the interval type itself.
    function ArrowTypes.ArrowType(
        ::Type{Interval{T,L,R}}
    ) where {T,L<:Bound,R<:Bound}
        return Interval{T,L,R}
    end

    # Metadata is the registered names of the left and right bound types, comma separated.
    function ArrowTypes.arrowmetadata(
        ::Type{Interval{T,L,R}}
    ) where {T,L<:Bound,R<:Bound}
        bound_names = arrowname.([L, R])
        return join(bound_names, ",")
    end

    # Reassemble the interval type from the `(first, last)` field layout plus the bound
    # type names stored in the metadata.
    function ArrowTypes.JuliaType(
        ::Val{name}, ::Type{NamedTuple{(:first, :last),Tuple{T,T}}}, meta
    ) where {T}
        bound_syms = Symbol.(split(meta, ","))
        L, R = ArrowTypes.JuliaType.(Val.(bound_syms))
        return Interval{T,L,R}
    end

    function ArrowTypes.fromarrow(
        ::Type{Interval{T,L,R}}, left, right
    ) where {T,L<:Bound,R<:Bound}
        # An unbounded side is constructed from `nothing`; the stored value is ignored.
        lower = L === Unbounded ? nothing : left
        upper = R === Unbounded ? nothing : right
        return Interval{T,L,R}(lower, upper)
    end
end

# Fallback Arrow serialization for vectors whose element type is only `Interval{T}` (the
# bound types may differ between elements). It is less compact because the registered name
# of each bound type is stored next to every endpoint.
let name = Symbol("JuliaLang.Intervals.Interval{T}")
    ArrowTypes.arrowname(::Type{<:Interval{T}}) where {T} = name

    function ArrowTypes.ArrowType(::Type{<:Interval{T}}) where {T}
        endpoint = Tuple{String,T}
        return NamedTuple{(:left, :right),Tuple{endpoint,endpoint}}
    end

    # Each side becomes a `(bound type name, endpoint value)` pair.
    function ArrowTypes.toarrow(x::Interval{T}) where {T}
        L, R = bounds_types(x)
        left = (string(arrowname(L)), x.first)
        right = (string(arrowname(R)), x.last)
        return (; left=left, right=right)
    end

    function ArrowTypes.JuliaType(
        ::Val{name}, ::Type{NamedTuple{names,types}}
    ) where {names,types}
        # `T` is taken from the second slot of the first field's tuple type
        # (expected to be `Tuple{String,T}`, matching what `toarrow` produces).
        first_endpoint = fieldtype(types, 1)
        T = fieldtype(first_endpoint, 2)
        return Interval{T}
    end

    function ArrowTypes.fromarrow(::Type{Interval{T}}, left, right) where {T}
        # Resolve the bound types from the names stored with each endpoint.
        L = ArrowTypes.JuliaType(Val(Symbol(left[1])))
        R = ArrowTypes.JuliaType(Val(Symbol(right[1])))

        # An unbounded side is constructed from `nothing`; the stored value is ignored.
        lower = L === Unbounded ? nothing : left[2]
        upper = R === Unbounded ? nothing : right[2]
        return Interval{T,L,R}(lower, upper)
    end
end

# Note: The type returned by the `ArrowType` function is not passed into the `JuliaType`
# function. Instead the result of `typeof(toarrow(...))` is passed into `JuliaType`.
# To reproduce this use an isbits object as a type parameter in `ArrowType`.

# Arrow serialization for `AnchoredInterval`s that also works for non-concrete element
# types. It is inefficient: the span `P` and both bound type names are stored with every
# element, wrapped in a single `anchor` field.
# NOTE(review): the accompanying test marks the element type round trip as broken because
# Arrow.jl converts Period types into Second, so `P` may not survive unchanged.
let name = Symbol("JuliaLang.Intervals.AnchoredInterval{P,T}")
    ArrowTypes.arrowname(::Type{<:AnchoredInterval{P,T}}) where {P,T} = name

    function ArrowTypes.ArrowType(::Type{<:AnchoredInterval{P,T}}) where {P,T}
        payload = Tuple{typeof(P),T,String,String}
        return NamedTuple{(:anchor,),Tuple{payload}}
    end

    # Payload layout: (span, anchor value, left bound name, right bound name).
    function ArrowTypes.toarrow(x::AnchoredInterval{P,T}) where {P,T}
        L, R = bounds_types(x)
        payload = (P, x.anchor, string(arrowname(L)), string(arrowname(R)))
        return (; anchor=payload)
    end

    # Only the unparameterized type is known here; the parameters are recovered from each
    # element's payload in `fromarrow`.
    ArrowTypes.JuliaType(::Val{name}) = AnchoredInterval

    function ArrowTypes.fromarrow(::Type{AnchoredInterval}, anchor)
        P = anchor[1]
        instant = anchor[2]
        # Arrow has no access to the original `T`, so use the type of the decoded value.
        T = typeof(instant)
        L = ArrowTypes.JuliaType(Val(Symbol(anchor[3])))
        R = ArrowTypes.JuliaType(Val(Symbol(anchor[4])))
        return AnchoredInterval{P,T,L,R}(instant)
    end
end
38 changes: 38 additions & 0 deletions test/arrow.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
@testset "Arrow support" begin
# Round trip a vector with a concrete `Interval{T,L,R}` element type through Arrow.
@testset "Interval (concrete)" begin
    col = [Interval{Closed,Unbounded}(1, nothing)]

    # Write `col=col` explicitly: the `(; col)` shorthand only parses on Julia 1.5+, while
    # the test suite includes this file on Julia 1.3 and later.
    table = (; col=col)
    t = Arrow.Table(Arrow.tobuffer(table))

    @test eltype(t.col) == Interval{Int,Closed,Unbounded}
    @test t.col == col
end

# Round trip a vector mixing different bound types (element type `Interval{Int}`).
@testset "Interval (non-concrete)" begin
    col = [
        Interval{Closed,Closed}(1, 2),
        Interval{Closed,Open}(2, 3),
        Interval{Unbounded,Open}(nothing, 4),
    ]

    # Write `col=col` explicitly: the `(; col)` shorthand only parses on Julia 1.5+, while
    # the test suite includes this file on Julia 1.3 and later.
    table = (; col=col)
    t = Arrow.Table(Arrow.tobuffer(table))

    @test eltype(t.col) == Interval{Int}
    @test t.col == col
end

@testset "AnchoredInterval" begin
# Hour-ending intervals covering one day, one per hour.
zdt_start = ZonedDateTime(2016, 8, 11, 1, tz"America/Winnipeg")
zdt_end = ZonedDateTime(2016, 8, 12, 0, tz"America/Winnipeg")
col = HE.(zdt_start:Hour(1):zdt_end)

# Write `col=col` explicitly: the `(; col)` shorthand only parses on Julia 1.5+, while
# the test suite includes this file on Julia 1.3 and later.
table = (; col=col)
t = Arrow.Table(Arrow.tobuffer(table))

# Arrow.jl converts all Period types into Second
@test_broken eltype(t.col) == HourEnding{ZonedDateTime, Open, Closed}
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll probably end up disabling AnchoredInterval support to start with because of this

Copy link
Member

@rofinn rofinn Jul 27, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't explicitly override this behaviour? Given how much data we have saved in HE format, needing to save hour data as Intervals would be a significant performance hit and likely wouldn't make this PR worth it for us. Also, doesn't Arrow have its own interval type which does the same thing?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've forgotten a bunch about this PR, but it should be possible to encode the period span in the serialized Arrow form. I do remember I was mainly focused on Interval support, so I think this comment was more about the fact that the implementation currently in place is incomplete.

@test t.col == col
end
end
8 changes: 8 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
VERSION >= v"1.3" && using Arrow

using Base.Iterators: product
using Dates
using Documenter: doctest
Expand All @@ -20,6 +22,12 @@ include("test_utils.jl")
include("comparisons.jl")
include("plotting.jl")

# The Arrow tests use the same Julia 1.3 version gate as the `using Arrow` statement at
# the top of this file; older versions skip them with a warning.
if VERSION < v"1.3"
    @warn "Skipping Arrow.jl support tests"
else
    include("arrow.jl")
end

# Note: The output of the doctests currently requires a newer version of Julia
# https://github.com/JuliaLang/julia/pull/34387
# The doctests fail on x86, so only run them on 64-bit hardware
Expand Down