From 8048f642c71f3012a6038b80a932ffeb0c586840 Mon Sep 17 00:00:00 2001
From: Valentin Kaisermayer
Date: Wed, 17 Jul 2024 14:18:02 +0200
Subject: [PATCH 1/6] adds method for splitting data into periods

minor

return iterator and adds test

starts to add retime

changes _split

changes LTS

overload Base.split

changes interface of _split to return timestamps

---
 .github/workflows/CI.yml |  3 +--
 docs/src/split.md        | 11 +++++++++
 src/TimeSeries.jl        |  1 +
 src/retime.jl            | 48 ++++++++++++++++++++++++++++++++++++++++
 src/split.jl             | 33 +++++++++++++++++++++++++++
 test/split.jl            | 17 ++++++++++++--
 6 files changed, 109 insertions(+), 4 deletions(-)
 create mode 100644 src/retime.jl

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 682f4e0d..748d58f0 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -15,8 +15,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - "1.6.7" # LTS
-          - "1.6"
+          - "1.10.7" # LTS
           - "1" # Latest Release
         os:
           - ubuntu-latest
diff --git a/docs/src/split.md b/docs/src/split.md
index 6b474a77..fbced9bb 100644
--- a/docs/src/split.md
+++ b/docs/src/split.md
@@ -124,3 +124,14 @@ using MarketData
 tail(cl)
 tail(cl, 3)
 ```
+
+## Splitting by period
+
+`split` divides the data into periods according to a given period function, e.g. `Dates.day`.
+
+```@repl
+using TimeSeries
+using MarketData
+
+split(cl, Dates.day)
+```
\ No newline at end of file
diff --git a/src/TimeSeries.jl b/src/TimeSeries.jl
index 3d03dee2..280b4746 100644
--- a/src/TimeSeries.jl
+++ b/src/TimeSeries.jl
@@ -58,5 +58,6 @@ include("modify.jl")
 include("basemisc.jl")
 include("deprecated.jl")
 include("plotrecipes.jl")
+include("retime.jl")
 
 end # module TimeSeries
diff --git a/src/retime.jl b/src/retime.jl
new file mode 100644
index 00000000..ad11c0be
--- /dev/null
+++ b/src/retime.jl
@@ -0,0 +1,48 @@
+function retime(ta, new_dt::Dates.Period; kwargs...)
+    new_timestamps = timestamp(ta)[1]:new_dt:timestamp(ta)[end]
+    return retime(ta, new_timestamps; kwargs...)
+end
+
+function retime(ta, period::Function; kwargs...)
+    new_timestamps = map(i -> first(timestamp(ta)[i]), _split(timestamp(ta), period))
+    return retime(ta, new_timestamps; kwargs...)
+end + +function retime( + ta::TimeSeries{T,N,D,A}, + new_timestamps::AbstractVector{DN}; + upsample=:previous, + downsample::Union{Symbol,Function}=:mean, + extrapolate::Bool=true, +) where {T,N,D,A,DN} + new_values = zeros(T, length(new_timestamps), size(values(ta), 2)) + old_timestamps = convert(Vector{DN}, timestamp(ta)) + old_values = values(ta) + @views begin + for col_i in 1:size(old_values, 2) + _retime!(new_values[:, col_i], old_timestamps, old_values[:, col_i], new_timestamps, upsample, downsample, extrapolate) + end + end + return TimeArray(new_timestamps, new_values, colnames(ta), meta(ta)) +end + +function _retime!( + new_values::AbstractVector{A}, + old_timestamps::AbstractVector{D}, + old_values::AbstractVector{A}, + new_timestamps::AbstractVector{D}, + upsample, + downsample, + extrapolate, +) where {D,A} + + x = Dates.value.(old_timestamps) + x_min, x_max = extrema(x) + x_new = Dates.value.(new_timestamps) + + # check each interval between i and i+1 if there is no or one sample (upsample), more than one sample (downsample) + for i in eachindex(x_new) + end + return +end + diff --git a/src/split.jl b/src/split.jl index 55054127..e7984b96 100644 --- a/src/split.jl +++ b/src/split.jl @@ -70,3 +70,36 @@ end Base.first(ta::TimeArray) = head(ta, 1) Base.last(ta::TimeArray) = tail(ta, 1) + +""" + split(data::TimeSeries.TimeArray, period::Function) + +Split `data` by `period` function, returns a vector of `TimeSeries.TimeArray`. + +## Arguments + +- `data::TimeSeries.TimeArray`: Data to split +- `period::Function`: Function, e.g. `Dates.day` that is used to split the `data`. +""" +Base.split(data::TimeSeries.TimeArray, period::Function) = + Iterators.map(i -> data[i], _split(TimeSeries.timestamp(data), period)) + +function _split(ts::AbstractVector{D}, period::Function) where {D<:TimeType} + m = length(ts) + idx = UnitRange{Int}[] + isempty(ts) && return idx + + sizehint!(idx, m) + t0 = period(ts[1]) + j = 1 + for i in 1:(m - 1) + t1 = period(ts[i + 1]) + t0 == t1 && continue + push!(idx, j:i) + j = i + 1 + t0 = t1 + end + push!(idx, j:m) + + return Iterators.map(i -> ts[i], idx) +end \ No newline at end of file diff --git a/test/split.jl b/test/split.jl index f434e1d9..18eb0165 100644 --- a/test/split.jl +++ b/test/split.jl @@ -1,8 +1,6 @@ using Dates using Test - using MarketData - using TimeSeries @testset "split" begin @@ -123,4 +121,19 @@ using TimeSeries @test length(last(ohlc)) == 1 end end + + @testset "split period" begin + for period in [day, week, month, year] + for cl_ in split(cl, period) + @test allequal(period.(timestamp(cl_))) + end + end + @test length(split(cl, day)) == 500 + @test length(split(cl, week)) == 105 + @test length(split(cl, month)) == 24 + @test length(split(cl, year)) == 2 + + # test empty timearray + @test length(split(to(cl, Date(2000)), week)) == 0 + end end # @testset "split" From 48d72ab2d3e4a915367f05cf33f4ec7ef79408bb Mon Sep 17 00:00:00 2001 From: Valentin Kaisermayer Date: Sat, 14 Dec 2024 22:50:21 +0100 Subject: [PATCH 2/6] adds retime adds missing deps another missing dep support symbols as well handle missing data adds docs on retime minor --- .gitignore | 1 + Project.toml | 11 --- docs/make.jl | 1 + docs/src/index.md | 1 + docs/src/plotting.md | 6 +- docs/src/retime.md | 90 ++++++++++++++++++ src/TimeSeries.jl | 19 +++- src/retime.jl | 214 +++++++++++++++++++++++++++++++++++++++---- test/Project.toml | 10 ++ test/retime.jl | 205 +++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 11 files changed, 527 insertions(+), 32 
deletions(-)
 create mode 100644 docs/src/retime.md
 create mode 100644 test/Project.toml
 create mode 100644 test/retime.jl

diff --git a/.gitignore b/.gitignore
index 827b2130..f10e505d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .swp
 docs/build/
 Manifest.toml
+test/Manifest.toml
diff --git a/Project.toml b/Project.toml
index 9c2f0e14..107ca2a3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -18,7 +18,6 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 [compat]
 DelimitedFiles = "1"
 DocStringExtensions = "0.8, 0.9"
-MarketData = "0.14"
 RecipesBase = "0.5, 0.7, 0.8, 1.0"
 Reexport = "1"
 Statistics = "1"
@@ -27,13 +26,3 @@ julia = "1.6"
 PrettyTables = "2"
 IteratorInterfaceExtensions = "1"
 TableTraits = "1"
-
-[extras]
-CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
-DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-MarketData = "945b72a4-3b13-509d-9b46-1525bb5c06de"
-Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-
-[targets]
-test = ["CSV", "DataFrames", "MarketData", "Random", "Test"]
diff --git a/docs/make.jl b/docs/make.jl
index 96d01ecd..b5e09293 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -20,6 +20,7 @@ makedocs(;
         "tables.md",
         "dotfile.md",
         "plotting.md",
+        "retime.md",
     ],
 )
diff --git a/docs/src/index.md b/docs/src/index.md
index 4bf21b0d..dd535f0c 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -19,5 +19,6 @@ Pages = [
     "tables.md",
     "dotfile.md",
     "plotting.md",
+    "retime.md",
 ]
 ```
diff --git a/docs/src/plotting.md b/docs/src/plotting.md
index 2f97e37f..cb1665d5 100644
--- a/docs/src/plotting.md
+++ b/docs/src/plotting.md
@@ -17,10 +17,8 @@ ta = yahoo(:GOOG, YahooOpt(; period1=now() - Month(1)))
 
 ## Plotting as multiple series
 
-The recipe allows `TimeArray` objects to be passed as input to `plot`. The
-recipe will plot each variable as an individual line, aligning all
-variables to the same y axis.
-backend).
+The recipe allows `TimeArray` objects to be passed as input to `plot`.
+The recipe will plot each variable as an individual line, aligning all variables to the same y axis.
 
 ```@example plot
 plot(ta[:Open, :High, :Low, :Close])
diff --git a/docs/src/retime.md b/docs/src/retime.md
new file mode 100644
index 00000000..4160db86
--- /dev/null
+++ b/docs/src/retime.md
@@ -0,0 +1,90 @@
+# Retime
+
+The `retime` function changes the timestamps of a `TimeArray` to a new time step or a new vector of timestamps, interpolating or aggregating the values as needed, similar to [Matlab's retime](https://www.mathworks.com/help/matlab/ref/timetable.retime.html).
+
+```@example retime
+using Plots, Dates, TimeSeries
+default(show = false) # hide
+ENV["GKSwstype"] = "100" # hide
+gr()
+timestamps = range(DateTime(2020, 1, 1), length = 7*24, step = Hour(1))
+ta = TimeArray(timestamps, cumsum(randn(7*24)), [:a])
+```
+
+## Using a new time step
+```@example retime
+retime(ta, Minute(15))
+```
+
+## Using a new timestamp vector
+```@example retime
+new_timestamps = range(DateTime(2020, 1, 1), DateTime(2020, 1, 2), step = Minute(15))
+retime(ta, new_timestamps)
+```
+
+## Irregular timestamps
+`retime` also works with irregularly spaced timestamps, either on a `TimeArray` whose timestamps are irregular or with an irregular vector of new timestamps. Depending on the spacing of the new timestamps relative to the data, either upsampling or downsampling is applied.
+```@example retime
+new_timestamps = vcat(
+    range(DateTime(2020, 1, 1), DateTime(2020, 1, 2)-Minute(15), step = Minute(15)),
+    range(DateTime(2020, 1, 2), DateTime(2020, 1, 3), step = Hour(1)),
+)
+retime(ta, new_timestamps)
+```
+
+## Upsampling
+
+Upsampling, i.e. interpolation, is controlled by the `upsample` argument. Whenever a new timestamp does not coincide exactly with an existing sample, the specified `upsample` method is used. Available `upsample` methods are:
+- `Linear()` or `:linear`
+- `Nearest()` or `:nearest`
+- `Previous()` or `:previous`
+- `Next()` or `:next`
+
+```@example retime
+ta_ = retime(ta, Minute(15), upsample=Linear())
+```
+
+```@example retime
+plot(ta)
+plot!(ta_)
+savefig("retime-upsampling.svg"); nothing # hide
+```
+![](retime-upsampling.svg)
+
+## Downsampling
+
+Downsampling, i.e. aggregation, is controlled by the `downsample` argument. The aggregation function is applied to all samples that fall into each new interval, excluding the right edge of the interval. If no data is present in an interval, the specified `upsample` method is used instead.
+Available `downsample` methods are:
+- `Mean()` or `:mean`
+- `Min()` or `:min`
+- `Max()` or `:max`
+- `Count()` or `:count`
+- `Sum()` or `:sum`
+- `Median()` or `:median`
+- `First()` or `:first`
+- `Last()` or `:last`
+
+```@example retime
+ta_ = retime(ta, Hour(6), downsample=Mean())
+```
+
+```@example retime
+plot(ta)
+plot!(ta_)
+savefig("retime-downsample.svg"); nothing # hide
+```
+![](retime-downsample.svg)
+
+## Extrapolation
+
+Extrapolation beyond the first and last timestamps of the original series is controlled by the `extrapolate` argument.
+Available `extrapolate` methods are:
+- `FillConstant(value)` or `:fillconstant`
+- `NearestExtrapolate()` or `:nearest`
+- `MissingExtrapolate()` or `:missing`
+- `NaNExtrapolate()` or `:nan`
+
+```@example retime
+new_timestamps = range(DateTime(2019, 12, 31), DateTime(2020, 1, 2), step = Minute(15))
+ta_ = retime(ta, new_timestamps, extrapolate=MissingExtrapolate())
+```
\ No newline at end of file
diff --git a/src/TimeSeries.jl b/src/TimeSeries.jl
index 280b4746..e4f911bf 100644
--- a/src/TimeSeries.jl
+++ b/src/TimeSeries.jl
@@ -36,7 +36,24 @@ export TimeArray,
     merge,
     collapse,
     readtimearray,
-    writetimearray
+    writetimearray,
+    retime,
+    Linear,
+    Previous,
+    Next,
+    Nearest,
+    Mean,
+    Min,
+    Max,
+    Count,
+    Sum,
+    Median,
+    First,
+    Last,
+    FillConstant,
+    NearestExtrapolate,
+    MissingExtrapolate,
+    NaNExtrapolate
 
 # modify.jl
 export rename, rename!
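
Taken together, the exported method types and the `retime` keyword arguments documented above combine as in the following minimal sketch. It reuses the hourly `TimeArray` construction from `docs/src/retime.md`; the keyword names `upsample`, `downsample`, and `extrapolate` are those defined in `src/retime.jl` below, and the result variable names are illustrative only.

```julia
using Dates, TimeSeries

# Hourly demo series, mirroring the docs/src/retime.md example.
timestamps = range(DateTime(2020, 1, 1); length=7 * 24, step=Hour(1))
ta = TimeArray(timestamps, cumsum(randn(7 * 24)), [:a])

# Finer grid: new timestamps between samples are filled by the upsample method.
quarter_hourly = retime(ta, Minute(15); upsample=Linear())

# Coarser grid: all samples in each interval are aggregated by the downsample method.
six_hourly = retime(ta, Hour(6); downsample=Mean())

# Timestamps outside the original range are handled by the extrapolate method.
padded = retime(ta, DateTime(2019, 12, 31):Hour(1):DateTime(2020, 1, 9);
                extrapolate=MissingExtrapolate())
```
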
diff --git a/src/retime.jl b/src/retime.jl index ad11c0be..d1531a17 100644 --- a/src/retime.jl +++ b/src/retime.jl @@ -1,48 +1,230 @@ +# Abstract types for interpolation, aggregation, and extrapolation methods +abstract type InterpolationMethod end +abstract type AggregationMethod end +abstract type ExtrapolationMethod end + +# Interpolation methods +struct Linear <: InterpolationMethod end +struct Previous <: InterpolationMethod end +struct Next <: InterpolationMethod end +struct Nearest <: InterpolationMethod end + +# Aggregation methods +struct Mean <: AggregationMethod end +struct Min <: AggregationMethod end +struct Max <: AggregationMethod end +struct Count <: AggregationMethod end +struct Sum <: AggregationMethod end +struct Median <: AggregationMethod end +struct First <: AggregationMethod end +struct Last <: AggregationMethod end + +# Extrapolation methods +struct FillConstant{V} <: ExtrapolationMethod + value::V +end +struct NearestExtrapolate <: ExtrapolationMethod end +struct MissingExtrapolate <: ExtrapolationMethod end +struct NaNExtrapolate <: ExtrapolationMethod end + +_toInterpolationMethod(x::Symbol) = _toInterpolationMethod(Val(x)) +_toInterpolationMethod(::Val{:linear}) = Linear() +_toInterpolationMethod(::Val{:previous}) = Previous() +_toInterpolationMethod(::Val{:next}) = Next() +_toInterpolationMethod(::Val{:nearest}) = Nearest() +_toInterpolationMethod(x::InterpolationMethod) = x + +_toAggregationMethod(x::Symbol) = _toAggregationMethod(Val(x)) +_toAggregationMethod(::Val{:mean}) = Mean() +_toAggregationMethod(::Val{:min}) = Min() +_toAggregationMethod(::Val{:max}) = Max() +_toAggregationMethod(::Val{:count}) = Count() +_toAggregationMethod(::Val{:sum}) = Sum() +_toAggregationMethod(::Val{:median}) = Median() +_toAggregationMethod(::Val{:first}) = First() +_toAggregationMethod(::Val{:last}) = Last() +_toAggregationMethod(x::AggregationMethod) = x + +_toExtrapolationMethod(x::Symbol) = _toExtrapolationMethod(Val(x)) +_toExtrapolationMethod(::Val{:fillconstant}) = FillConstant(0.0) +_toExtrapolationMethod(::Val{:nearest}) = NearestExtrapolate() +_toExtrapolationMethod(::Val{:missing}) = MissingExtrapolate() +_toExtrapolationMethod(::Val{:nan}) = NaNExtrapolate() +_toExtrapolationMethod(x::ExtrapolationMethod) = x + function retime(ta, new_dt::Dates.Period; kwargs...) new_timestamps = timestamp(ta)[1]:new_dt:timestamp(ta)[end] return retime(ta, new_timestamps; kwargs...) end function retime(ta, period::Function; kwargs...) - new_timestamps = map(i -> first(timestamp(ta)[i]), _split(timestamp(ta), period)) + new_timestamps = map(i -> first(i), _split(timestamp(ta), period)) return retime(ta, new_timestamps; kwargs...) 
end function retime( - ta::TimeSeries{T,N,D,A}, + ta::TimeArray{T,N,D,A}, new_timestamps::AbstractVector{DN}; - upsample=:previous, - downsample::Union{Symbol,Function}=:mean, - extrapolate::Bool=true, + upsample::Union{Symbol,InterpolationMethod}=Previous(), + downsample::Union{Symbol,AggregationMethod}=Mean(), + extrapolate::Union{Symbol,ExtrapolationMethod}=NearestExtrapolate(), + skip_missing::Bool=true, ) where {T,N,D,A,DN} - new_values = zeros(T, length(new_timestamps), size(values(ta), 2)) + upsample = _toInterpolationMethod(upsample) + downsample = _toAggregationMethod(downsample) + extrapolate = _toExtrapolationMethod(extrapolate) + + new_values = __get_new_values(T, length(new_timestamps), size(values(ta), 2), extrapolate, skip_missing) old_timestamps = convert(Vector{DN}, timestamp(ta)) old_values = values(ta) @views begin for col_i in 1:size(old_values, 2) - _retime!(new_values[:, col_i], old_timestamps, old_values[:, col_i], new_timestamps, upsample, downsample, extrapolate) + if skip_missing + idx = findall(x -> !ismissing(x) && !isnan(x), old_values[:, col_i]) + else + idx = ones(Int, length(old_timestamps)) + end + + _retime!( + new_values[:, col_i], + old_timestamps[idx], + old_values[idx, col_i], + new_timestamps, + upsample, + downsample, + extrapolate, + skip_missing, + ) end end return TimeArray(new_timestamps, new_values, colnames(ta), meta(ta)) end function _retime!( - new_values::AbstractVector{A}, + new_values::AbstractVector{AN}, old_timestamps::AbstractVector{D}, old_values::AbstractVector{A}, new_timestamps::AbstractVector{D}, - upsample, - downsample, - extrapolate, -) where {D,A} - + upsample::InterpolationMethod, + downsample::AggregationMethod, + extrapolate::ExtrapolationMethod, + skip_missing::Bool, +) where {D,AN,A} x = Dates.value.(old_timestamps) x_min, x_max = extrema(x) x_new = Dates.value.(new_timestamps) - # check each interval between i and i+1 if there is no or one sample (upsample), more than one sample (downsample) - for i in eachindex(x_new) + N = length(x_new) + + @views begin + # check each interval between i and i+1 if there is no or one sample (upsample), more than one sample (downsample) + for i in 1:N + if x_new[i] < x_min || x_new[i] > x_max + # Handle extrapolation + new_values[i] = _extrapolate(extrapolate, x_new[i], x, old_values) + else + idx = if i < N + _get_idx(x, x_new[i], x_new[i+1]) + else + # assume that the last interval is the same length as the second to last one + _get_idx(x, x_new[i], x_new[i] + (x_new[i] - x_new[i-1])) + end + + if isempty(idx) + # No original samples lie between x_new[i] and x_new[i+1] --> Upsampling + new_values[i] = _upsample(upsample, x, old_values, x_new[i]) + elseif length(idx) == 1 + if x_new[i] == x[idx[1]] # directly hit the sample, do not try the upsample method + new_values[i] = old_values[idx[1]] + else + # Only one sample found in the interval x_new[i] and x_new[i+1] --> use the upsample method + new_values[i] = _upsample(upsample, x, old_values, x_new[i]) + end + else + # Multiple samples were found in the interval [x_new[i], x_new[i+1]) --> use the downsample method to get the agglomeration + new_values[i] = _downsample(downsample, old_values[idx]) + end + end + end + end + return nothing +end + +function __get_new_values(T, N, n, extrapolate, skip_missing) + return zeros(skip_missing ? 
nonmissingtype(T) : T, N, n) +end +function __get_new_values(T, N, n, extrapolate::MissingExtrapolate, skip_missing) + return zeros(Union{Missing,T}, N, n) +end + +function _get_idx(x::AbstractVector{<:Real}, x_left::Real, x_right::Real) + idx_left = searchsortedfirst(x, x_left) # greater or equal to x_left + idx_right = searchsortedlast(x, prevfloat(Float64(x_right))) # less to x_right + return idx_left:idx_right +end + +# Extrapolation dispatch +function _extrapolate(m::FillConstant, t_new, x, y) + return m.value +end + +function _extrapolate(::NearestExtrapolate, t_new, x, y) + idx = argmin(abs.(x .- t_new)) + return y[idx] +end + +function _extrapolate(::MissingExtrapolate, t_new, x, y) + return missing +end + +function _extrapolate(::NaNExtrapolate, t_new, x, y) + return NaN +end + +# Interpolation dispatch +function _upsample(::Linear, x_old, old_values, x) + idx_next = searchsortedfirst(x_old, x) # greater or equal to x + idx_prev = searchsortedlast(x_old, x) # less or equal to x + return y = if idx_prev == idx_next # avoid division by zero + old_values[idx_prev] + else + old_values[idx_prev] + + (x - x_old[idx_prev]) * (old_values[idx_next] - old_values[idx_prev]) / (x_old[idx_next] - x_old[idx_prev]) + end +end + +function _upsample(::Previous, x_old, old_values, x) + idx_prev = searchsortedlast(x_old, x) # less or equal to x + return old_values[idx_prev] +end + +function _upsample(::Next, x_old, old_values, x) + idx_next = searchsortedfirst(x_old, x) # greater or equal to x + return old_values[idx_next] +end + +function _upsample(::Nearest, x_old, old_values, x) + idx_next = searchsortedfirst(x_old, x) # greater or equal to x + idx_prev = searchsortedlast(x_old, x) # less or equal to x + y = if idx_prev == idx_next # avoid division by zero + old_values[idx_prev] + else + pos = (x - x_old[idx_prev]) / (x_old[idx_next] - x_old[idx_prev]) + if pos < 0.5 + old_values[idx_prev] + else + old_values[idx_next] + end end - return + return y end +# Aggregation dispatch +_downsample(::Mean, values_in_range) = mean(values_in_range) +_downsample(::Min, values_in_range) = minimum(values_in_range) +_downsample(::Max, values_in_range) = maximum(values_in_range) +_downsample(::Count, values_in_range) = count(!ismissing, values_in_range) +_downsample(::Sum, values_in_range) = sum(values_in_range) +_downsample(::Median, values_in_range) = median(values_in_range) +_downsample(::First, values_in_range) = first(values_in_range) +_downsample(::Last, values_in_range) = last(values_in_range) \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 00000000..c05da130 --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,10 @@ +[deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +MarketData = "945b72a4-3b13-509d-9b46-1525bb5c06de" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +TimeSeries = "9e3dc215-6440-5c97-bce1-76c03772f85e" diff --git a/test/retime.jl b/test/retime.jl new file mode 100644 index 00000000..d540c9b5 --- /dev/null +++ b/test/retime.jl @@ -0,0 +1,205 @@ +using Test +using MarketData +using TimeSeries +using Dates +using Statistics + +@testset "retime" begin + + @testset "interpolation" begin + @test TimeSeries._toInterpolationMethod(:linear) == TimeSeries.Linear() + @test 
TimeSeries._toInterpolationMethod(:nearest) == TimeSeries.Nearest() + @test TimeSeries._toInterpolationMethod(:previous) == TimeSeries.Previous() + @test TimeSeries._toInterpolationMethod(:next) == TimeSeries.Next() + + @test_throws MethodError TimeSeries._toInterpolationMethod(:foo) + end + + @testset "aggregation" begin + @test TimeSeries._toAggregationMethod(:mean) == TimeSeries.Mean() + @test TimeSeries._toAggregationMethod(:min) == TimeSeries.Min() + @test TimeSeries._toAggregationMethod(:max) == TimeSeries.Max() + @test TimeSeries._toAggregationMethod(:count) == TimeSeries.Count() + @test TimeSeries._toAggregationMethod(:sum) == TimeSeries.Sum() + @test TimeSeries._toAggregationMethod(:median) == TimeSeries.Median() + @test TimeSeries._toAggregationMethod(:first) == TimeSeries.First() + @test TimeSeries._toAggregationMethod(:last) == TimeSeries.Last() + + @test_throws MethodError TimeSeries._toAggregationMethod(:foo) + end + + @testset "extrapolation" begin + @test TimeSeries._toExtrapolationMethod(:fillconstant) == TimeSeries.FillConstant(0.0) + @test TimeSeries._toExtrapolationMethod(:nearest) == TimeSeries.NearestExtrapolate() + @test TimeSeries._toExtrapolationMethod(:missing) == TimeSeries.MissingExtrapolate() + @test TimeSeries._toExtrapolationMethod(:nan) == TimeSeries.NaNExtrapolate() + + @test_throws MethodError TimeSeries._toExtrapolationMethod(:foo) + end + + @testset "single column" begin + new_timestamps = collect(Dates.Date(2000):Dates.Week(1):Dates.Date(2001)) + + funcs = [mean, sum, minimum, maximum, last] + downsamples = [TimeSeries.Mean(), TimeSeries.Sum(), TimeSeries.Min(), TimeSeries.Max(), TimeSeries.Last()] + @testset for (func, downsample) in zip(funcs, downsamples) + cl_new = retime(cl, new_timestamps; upsample=TimeSeries.Linear(), downsample) + + @test timestamp(cl_new) == new_timestamps + + # extrapolation + @test values(cl_new[1, :Close]) == values(cl[1, :Close]) + + # aggregation + idx = new_timestamps[2] .<= timestamp(cl) .< new_timestamps[3] + @test func(values(cl[:Close][idx])) == values(cl_new[:Close][2])[1] + end + + # test using Symbols + downsamples = [:mean, :sum, :min, :max, :last] + @testset for (func, downsample) in zip(funcs, downsamples) + cl_new = retime(cl, new_timestamps; upsample=TimeSeries.Linear(), downsample) + + @test timestamp(cl_new) == new_timestamps + + # extrapolation + @test values(cl_new[1, :Close]) == values(cl[1, :Close]) + + # aggregation + idx = new_timestamps[2] .<= timestamp(cl) .< new_timestamps[3] + @test func(values(cl[:Close][idx])) == values(cl_new[:Close][2])[1] + end + end + + @testset "single column interpolation" begin + new_timestamps = collect(Dates.DateTime(2000):Dates.Hour(1):Dates.DateTime(2001)) + + upsamples = [TimeSeries.Linear(), TimeSeries.Previous(), TimeSeries.Next(), TimeSeries.Nearest()] + @testset for upsample in upsamples + cl_new = retime(cl, new_timestamps; upsample) + + @test timestamp(cl_new) == new_timestamps + + # TODO: real tests + end + + # test using Symbols + upsamples = [:linear, :previous, :next, :nearest] + @testset for upsample in upsamples + cl_new = retime(cl, new_timestamps; upsample) + + @test timestamp(cl_new) == new_timestamps + + # TODO: real tests + end + end + + @testset "single column extrapolate" begin + new_timestamps = collect(Dates.DateTime(2000):Dates.Hour(1):Dates.DateTime(2001)) + + cl_new = retime(cl, new_timestamps; extrapolate=TimeSeries.FillConstant(0.0)) + @test timestamp(cl_new) == new_timestamps + @test values(cl_new[:Close][1])[1] == 0.0 + + cl_new = 
retime(cl, new_timestamps; extrapolate=TimeSeries.NearestExtrapolate()) + @test timestamp(cl_new) == new_timestamps + @test values(cl_new[:Close][1])[1] == values(cl[:Close][1])[1] + + cl_new = retime(cl, new_timestamps; extrapolate=TimeSeries.MissingExtrapolate()) + @test timestamp(cl_new) == new_timestamps + @test all(ismissing.(values(cl_new[:Close][1]))) + + cl_new = retime(cl, new_timestamps; extrapolate=TimeSeries.NaNExtrapolate()) + @test timestamp(cl_new) == new_timestamps + @test all(isnan.(values(cl_new[:Close][1]))) + end + + @testset "multi column" begin + new_timestamps = collect(Dates.Date(2000):Dates.Week(1):Dates.Date(2001)) + + funcs = [mean, sum, minimum, maximum, last] + downsamples = [TimeSeries.Mean(), TimeSeries.Sum(), TimeSeries.Min(), TimeSeries.Max(), TimeSeries.Last()] + @testset for (func, downsample) in zip(funcs, downsamples) + + ohlc_new = retime(ohlc, new_timestamps; upsample=TimeSeries.Linear(), downsample=TimeSeries.Mean()) + + @test timestamp(ohlc_new) == new_timestamps + + # extrapolation + @test values(ohlc_new[1]) == values(ohlc_new[1]) + + idx = new_timestamps[2] .<= timestamp(ohlc) .< new_timestamps[3] + @test mean(values(ohlc[idx]); dims=1) == values(ohlc_new[2]) + end + end + + @testset "multi column interpolation" begin + new_timestamps = collect(Dates.DateTime(2000):Dates.Hour(1):Dates.DateTime(2001)) + + upsamples = [TimeSeries.Linear(), TimeSeries.Previous(), TimeSeries.Next(), TimeSeries.Nearest()] + @testset for upsample in upsamples + ohlc_new = retime(ohlc, new_timestamps; upsample) + + @test timestamp(ohlc_new) == new_timestamps + + # TODO: real tests + end + end + + @testset "multi column extrapolate" begin + new_timestamps = collect(Dates.DateTime(2000):Dates.Hour(1):Dates.DateTime(2001)) + + ohlc_new = retime(ohlc, new_timestamps; extrapolate=TimeSeries.FillConstant(0.0)) + @test timestamp(ohlc_new) == new_timestamps + @test values(ohlc_new[1]) == zeros(1, 4) + + ohlc_new = retime(ohlc, new_timestamps; extrapolate=TimeSeries.NearestExtrapolate()) + @test timestamp(ohlc_new) == new_timestamps + @test values(ohlc_new[1]) == values(ohlc[1]) + + ohlc_new = retime(ohlc, new_timestamps; extrapolate=TimeSeries.MissingExtrapolate()) + @test timestamp(ohlc_new) == new_timestamps + @test all(ismissing.(values(ohlc_new[1]))) + + ohlc_new = retime(ohlc, new_timestamps; extrapolate=TimeSeries.NaNExtrapolate()) + @test timestamp(ohlc_new) == new_timestamps + @test all(isnan.(values(ohlc_new[1]))) + end + + @testset "single column with missing" begin + new_timestamps = collect(Dates.Date(2000):Dates.Week(1):Dates.Date(2001)) + # corrupt some values + cl_missing = TimeArray( + timestamp(cl), + let vals = convert(Vector{Union{Float64,Missing}}, copy(values(cl))) + vals[rand(1:length(vals), 100)] .= missing + vals + end, + colnames(cl), + ) + + cl_new = retime(cl_missing, new_timestamps; upsample=:linear, downsample=:mean, skip_missing=false) + + cl_new = retime(cl_missing, new_timestamps; upsample=:linear, downsample=:mean, skip_missing=true) + @test !any(ismissing.(values(cl_new))) + end + + @testset "single column with NaN" begin + new_timestamps = collect(Dates.Date(2000):Dates.Week(1):Dates.Date(2001)) + # corrupt some values + cl_missing = TimeArray( + timestamp(cl), + let vals = copy(values(cl)) + vals[rand(1:length(vals), 100)] .= NaN + vals + end, + colnames(cl), + ) + + cl_new = retime(cl_missing, new_timestamps; upsample=:linear, downsample=:mean, skip_missing=false) + + cl_new = retime(cl_missing, new_timestamps; upsample=:linear, 
downsample=:mean, skip_missing=true) + @test !any(isnan.(values(cl_new))) + end + +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 85513609..8b226ff3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -13,6 +13,7 @@ tests = [ "basemisc", "tables", "plotrecipes", + "retime", ] @testset "TimeSeries" begin From d530d22ec43eb57cfb97e8393ed9327a65dfc769 Mon Sep 17 00:00:00 2001 From: Valentin Kaisermayer Date: Mon, 20 Jan 2025 19:16:40 +0100 Subject: [PATCH 3/6] formatting --- src/retime.jl | 13 +++++---- src/split.jl | 2 +- test/retime.jl | 77 +++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 72 insertions(+), 20 deletions(-) diff --git a/src/retime.jl b/src/retime.jl index d1531a17..569200e0 100644 --- a/src/retime.jl +++ b/src/retime.jl @@ -74,7 +74,9 @@ function retime( downsample = _toAggregationMethod(downsample) extrapolate = _toExtrapolationMethod(extrapolate) - new_values = __get_new_values(T, length(new_timestamps), size(values(ta), 2), extrapolate, skip_missing) + new_values = __get_new_values( + T, length(new_timestamps), size(values(ta), 2), extrapolate, skip_missing + ) old_timestamps = convert(Vector{DN}, timestamp(ta)) old_values = values(ta) @views begin @@ -124,10 +126,10 @@ function _retime!( new_values[i] = _extrapolate(extrapolate, x_new[i], x, old_values) else idx = if i < N - _get_idx(x, x_new[i], x_new[i+1]) + _get_idx(x, x_new[i], x_new[i + 1]) else # assume that the last interval is the same length as the second to last one - _get_idx(x, x_new[i], x_new[i] + (x_new[i] - x_new[i-1])) + _get_idx(x, x_new[i], x_new[i] + (x_new[i] - x_new[i - 1])) end if isempty(idx) @@ -189,7 +191,8 @@ function _upsample(::Linear, x_old, old_values, x) old_values[idx_prev] else old_values[idx_prev] + - (x - x_old[idx_prev]) * (old_values[idx_next] - old_values[idx_prev]) / (x_old[idx_next] - x_old[idx_prev]) + (x - x_old[idx_prev]) * (old_values[idx_next] - old_values[idx_prev]) / + (x_old[idx_next] - x_old[idx_prev]) end end @@ -227,4 +230,4 @@ _downsample(::Count, values_in_range) = count(!ismissing, values_in_range) _downsample(::Sum, values_in_range) = sum(values_in_range) _downsample(::Median, values_in_range) = median(values_in_range) _downsample(::First, values_in_range) = first(values_in_range) -_downsample(::Last, values_in_range) = last(values_in_range) \ No newline at end of file +_downsample(::Last, values_in_range) = last(values_in_range) diff --git a/src/split.jl b/src/split.jl index e7984b96..a8d3eb17 100644 --- a/src/split.jl +++ b/src/split.jl @@ -102,4 +102,4 @@ function _split(ts::AbstractVector{D}, period::Function) where {D<:TimeType} push!(idx, j:m) return Iterators.map(i -> ts[i], idx) -end \ No newline at end of file +end diff --git a/test/retime.jl b/test/retime.jl index d540c9b5..a6aa5583 100644 --- a/test/retime.jl +++ b/test/retime.jl @@ -5,7 +5,6 @@ using Dates using Statistics @testset "retime" begin - @testset "interpolation" begin @test TimeSeries._toInterpolationMethod(:linear) == TimeSeries.Linear() @test TimeSeries._toInterpolationMethod(:nearest) == TimeSeries.Nearest() @@ -29,7 +28,8 @@ using Statistics end @testset "extrapolation" begin - @test TimeSeries._toExtrapolationMethod(:fillconstant) == TimeSeries.FillConstant(0.0) + @test TimeSeries._toExtrapolationMethod(:fillconstant) == + TimeSeries.FillConstant(0.0) @test TimeSeries._toExtrapolationMethod(:nearest) == TimeSeries.NearestExtrapolate() @test TimeSeries._toExtrapolationMethod(:missing) == TimeSeries.MissingExtrapolate() @test 
TimeSeries._toExtrapolationMethod(:nan) == TimeSeries.NaNExtrapolate() @@ -41,7 +41,13 @@ using Statistics new_timestamps = collect(Dates.Date(2000):Dates.Week(1):Dates.Date(2001)) funcs = [mean, sum, minimum, maximum, last] - downsamples = [TimeSeries.Mean(), TimeSeries.Sum(), TimeSeries.Min(), TimeSeries.Max(), TimeSeries.Last()] + downsamples = [ + TimeSeries.Mean(), + TimeSeries.Sum(), + TimeSeries.Min(), + TimeSeries.Max(), + TimeSeries.Last(), + ] @testset for (func, downsample) in zip(funcs, downsamples) cl_new = retime(cl, new_timestamps; upsample=TimeSeries.Linear(), downsample) @@ -74,7 +80,12 @@ using Statistics @testset "single column interpolation" begin new_timestamps = collect(Dates.DateTime(2000):Dates.Hour(1):Dates.DateTime(2001)) - upsamples = [TimeSeries.Linear(), TimeSeries.Previous(), TimeSeries.Next(), TimeSeries.Nearest()] + upsamples = [ + TimeSeries.Linear(), + TimeSeries.Previous(), + TimeSeries.Next(), + TimeSeries.Nearest(), + ] @testset for upsample in upsamples cl_new = retime(cl, new_timestamps; upsample) @@ -118,10 +129,20 @@ using Statistics new_timestamps = collect(Dates.Date(2000):Dates.Week(1):Dates.Date(2001)) funcs = [mean, sum, minimum, maximum, last] - downsamples = [TimeSeries.Mean(), TimeSeries.Sum(), TimeSeries.Min(), TimeSeries.Max(), TimeSeries.Last()] + downsamples = [ + TimeSeries.Mean(), + TimeSeries.Sum(), + TimeSeries.Min(), + TimeSeries.Max(), + TimeSeries.Last(), + ] @testset for (func, downsample) in zip(funcs, downsamples) - - ohlc_new = retime(ohlc, new_timestamps; upsample=TimeSeries.Linear(), downsample=TimeSeries.Mean()) + ohlc_new = retime( + ohlc, + new_timestamps; + upsample=TimeSeries.Linear(), + downsample=TimeSeries.Mean(), + ) @test timestamp(ohlc_new) == new_timestamps @@ -136,7 +157,12 @@ using Statistics @testset "multi column interpolation" begin new_timestamps = collect(Dates.DateTime(2000):Dates.Hour(1):Dates.DateTime(2001)) - upsamples = [TimeSeries.Linear(), TimeSeries.Previous(), TimeSeries.Next(), TimeSeries.Nearest()] + upsamples = [ + TimeSeries.Linear(), + TimeSeries.Previous(), + TimeSeries.Next(), + TimeSeries.Nearest(), + ] @testset for upsample in upsamples ohlc_new = retime(ohlc, new_timestamps; upsample) @@ -178,9 +204,21 @@ using Statistics colnames(cl), ) - cl_new = retime(cl_missing, new_timestamps; upsample=:linear, downsample=:mean, skip_missing=false) + cl_new = retime( + cl_missing, + new_timestamps; + upsample=:linear, + downsample=:mean, + skip_missing=false, + ) - cl_new = retime(cl_missing, new_timestamps; upsample=:linear, downsample=:mean, skip_missing=true) + cl_new = retime( + cl_missing, + new_timestamps; + upsample=:linear, + downsample=:mean, + skip_missing=true, + ) @test !any(ismissing.(values(cl_new))) end @@ -196,10 +234,21 @@ using Statistics colnames(cl), ) - cl_new = retime(cl_missing, new_timestamps; upsample=:linear, downsample=:mean, skip_missing=false) + cl_new = retime( + cl_missing, + new_timestamps; + upsample=:linear, + downsample=:mean, + skip_missing=false, + ) - cl_new = retime(cl_missing, new_timestamps; upsample=:linear, downsample=:mean, skip_missing=true) + cl_new = retime( + cl_missing, + new_timestamps; + upsample=:linear, + downsample=:mean, + skip_missing=true, + ) @test !any(isnan.(values(cl_new))) end - -end \ No newline at end of file +end From 17e6d93ac66aef4c5e6609383cfb8734a96d70c1 Mon Sep 17 00:00:00 2001 From: Valentin Kaisermayer Date: Sat, 25 Jan 2025 20:22:47 +0100 Subject: [PATCH 4/6] fixes integer issue --- src/retime.jl | 43 
+++++++++++++++++++++++++++++++------------ test/retime.jl | 26 ++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/src/retime.jl b/src/retime.jl index 569200e0..b94ba6f6 100644 --- a/src/retime.jl +++ b/src/retime.jl @@ -53,7 +53,8 @@ _toExtrapolationMethod(::Val{:nan}) = NaNExtrapolate() _toExtrapolationMethod(x::ExtrapolationMethod) = x function retime(ta, new_dt::Dates.Period; kwargs...) - new_timestamps = timestamp(ta)[1]:new_dt:timestamp(ta)[end] + new_timestamps = + floor(timestamp(ta)[1], new_dt):new_dt:floor(timestamp(ta)[end], new_dt) return retime(ta, new_timestamps; kwargs...) end @@ -74,8 +75,14 @@ function retime( downsample = _toAggregationMethod(downsample) extrapolate = _toExtrapolationMethod(extrapolate) - new_values = __get_new_values( - T, length(new_timestamps), size(values(ta), 2), extrapolate, skip_missing + new_values = __allocate_new_values( + T, + length(new_timestamps), + size(ta, 2), + upsample, + downsample, + extrapolate, + skip_missing, ) old_timestamps = convert(Vector{DN}, timestamp(ta)) old_values = values(ta) @@ -95,7 +102,6 @@ function retime( upsample, downsample, extrapolate, - skip_missing, ) end end @@ -110,10 +116,9 @@ function _retime!( upsample::InterpolationMethod, downsample::AggregationMethod, extrapolate::ExtrapolationMethod, - skip_missing::Bool, ) where {D,AN,A} x = Dates.value.(old_timestamps) - x_min, x_max = extrema(x) + x_min, x_max = x[1], x[end] # assume that the timestamps are sorted x_new = Dates.value.(new_timestamps) N = length(x_new) @@ -152,13 +157,23 @@ function _retime!( return nothing end -function __get_new_values(T, N, n, extrapolate, skip_missing) - return zeros(skip_missing ? nonmissingtype(T) : T, N, n) -end -function __get_new_values(T, N, n, extrapolate::MissingExtrapolate, skip_missing) - return zeros(Union{Missing,T}, N, n) +function __allocate_new_values(T, N, n, upsample, downsample, extrapolate, skip_missing) + T = skip_missing ? 
nonmissingtype(T) : T + new_type = promote_type( + T, __get_type(T, upsample), __get_type(T, downsample), __get_type(T, extrapolate) + ) + return zeros(new_type, N, n) end +__get_type(::Type{T}, ::InterpolationMethod) where {T} = T +__get_type(::Type{Int}, ::Linear) = Float64 # interpolating integers can result in floats + +__get_type(::Type{T}, ::AggregationMethod) where {T} = T +__get_type(::Type{Int}, ::Mean) = Float64 # aggregating integers can result in floats + +__get_type(::Type{T}, ::ExtrapolationMethod) where {T} = T +__get_type(::Type{T}, ::MissingExtrapolate) where {T} = Union{T,Missing} + function _get_idx(x::AbstractVector{<:Real}, x_left::Real, x_right::Real) idx_left = searchsortedfirst(x, x_left) # greater or equal to x_left idx_right = searchsortedlast(x, prevfloat(Float64(x_right))) # less to x_right @@ -171,7 +186,11 @@ function _extrapolate(m::FillConstant, t_new, x, y) end function _extrapolate(::NearestExtrapolate, t_new, x, y) - idx = argmin(abs.(x .- t_new)) + idx = if t_new < x[1] + 1 + else + length(x) + end return y[idx] end diff --git a/test/retime.jl b/test/retime.jl index a6aa5583..1914a120 100644 --- a/test/retime.jl +++ b/test/retime.jl @@ -251,4 +251,30 @@ using Statistics ) @test !any(isnan.(values(cl_new))) end + + @testset "Aggregate integers with :mean" begin + ta = TimeArray( + [ + DateTime(2025, 1, 1, 8, 0), + DateTime(2025, 1, 2, 2, 0), + DateTime(2025, 1, 3, 9, 0), + ], + [1, 2, 3], + ) + + ta_new = retime(ta, Day(1)) + end + + @testset "Interpolate integers with :linear" begin + ta = TimeArray( + [ + DateTime(2025, 1, 1, 8, 0), + DateTime(2025, 1, 2, 2, 0), + DateTime(2025, 1, 3, 9, 0), + ], + [1, 2, 3], + ) + + ta_new = retime(ta, Hour(1); upsample=:linear) + end end From 06d413ac3ba124bc77747edab711c6bc73665eb9 Mon Sep 17 00:00:00 2001 From: Valentin Kaisermayer Date: Sat, 25 Jan 2025 20:56:49 +0100 Subject: [PATCH 5/6] changes method that is used when only one sample is in the interval to downsampling --- src/retime.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/retime.jl b/src/retime.jl index b94ba6f6..b9c204c4 100644 --- a/src/retime.jl +++ b/src/retime.jl @@ -145,7 +145,9 @@ function _retime!( new_values[i] = old_values[idx[1]] else # Only one sample found in the interval x_new[i] and x_new[i+1] --> use the upsample method - new_values[i] = _upsample(upsample, x, old_values, x_new[i]) + # new_values[i] = _upsample(upsample, x, old_values, x_new[i]) + # Only one sample found in the interval x_new[i] and x_new[i+1] --> still use the downsample method? + new_values[i] = _downsample(downsample, old_values[idx]) end else # Multiple samples were found in the interval [x_new[i], x_new[i+1]) --> use the downsample method to get the agglomeration From 59c4c6cdef46e0694f73babdc4795e9c39a2f176 Mon Sep 17 00:00:00 2001 From: Valentin Kaisermayer Date: Sat, 25 Jan 2025 21:01:24 +0100 Subject: [PATCH 6/6] use colon instead of allocating an index --- src/retime.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/retime.jl b/src/retime.jl index b9c204c4..afe42173 100644 --- a/src/retime.jl +++ b/src/retime.jl @@ -88,10 +88,10 @@ function retime( old_values = values(ta) @views begin for col_i in 1:size(old_values, 2) - if skip_missing - idx = findall(x -> !ismissing(x) && !isnan(x), old_values[:, col_i]) + idx = if skip_missing + findall(x -> !ismissing(x) && !isnan(x), old_values[:, col_i]) else - idx = ones(Int, length(old_timestamps)) + Colon() end _retime!(