fix #63 + some efforts to fix #64
sl-solution committed Jun 9, 2022
1 parent bc2cb3f commit 4d213f0
Showing 10 changed files with 154 additions and 23 deletions.
1 change: 1 addition & 0 deletions src/InMemoryDatasets.jl
@@ -63,6 +63,7 @@ export
repeat!,
select,
select!,
delete,
mapcols,
insertcols!,
mask,
8 changes: 4 additions & 4 deletions src/dataset/combine.jl
@@ -618,7 +618,7 @@ function combine(ds::Dataset, @nospecialize(args...); dropgroupcols = false, thr
# we will use new_lengths later for assigning the grouping info of the new ds
if _first_vector_res == 0
new_lengths = ones(Int, ngroups)
cumsum!(new_lengths, new_lengths)
our_cumsum!(new_lengths)
total_lengths = ngroups
else
if ms[_first_vector_res].first isa Tuple
@@ -637,7 +637,7 @@ function combine(ds::Dataset, @nospecialize(args...); dropgroupcols = false, thr
_compute_the_mutli_row_trans!(special_res, new_lengths, _columns(ds)[index(ds)[ms[_first_vector_res].first]], nrow(ds), ms[_first_vector_res].second.first, _first_vector_res, starts, ngroups, threads)
end
# special_res, new_lengths = _compute_the_mutli_row_trans(ds, ms, _first_vector_res, starts, ngroups)
cumsum!(new_lengths, new_lengths)
our_cumsum!(new_lengths)
total_lengths = new_lengths[end]
end
all_names = _names(ds)
@@ -715,7 +715,7 @@ function combine_ds(ds::AbstractDataset, @nospecialize(args...); threads = true)
# we will use new_lengths later for assigning the grouping info of the new ds
if _first_vector_res == 0
new_lengths = ones(Int, ngroups)
cumsum!(new_lengths, new_lengths)
our_cumsum!(new_lengths)
total_lengths = ngroups
else
if ms[_first_vector_res].first isa Tuple
Expand All @@ -734,7 +734,7 @@ function combine_ds(ds::AbstractDataset, @nospecialize(args...); threads = true)
_compute_the_mutli_row_trans!(special_res, new_lengths, _columns(ds)[index(ds)[ms[_first_vector_res].first]], nrow(ds), ms[_first_vector_res].second.first, _first_vector_res, starts, ngroups, threads)
end
# special_res, new_lengths = _compute_the_mutli_row_trans(ds, ms, _first_vector_res, starts, ngroups)
cumsum!(new_lengths, new_lengths)
our_cumsum!(new_lengths)
total_lengths = new_lengths[end]
end
all_names = _names(ds)
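
Every call site in this commit replaces `cumsum!(v, v)` with the new package-local helper `our_cumsum!(v)`, whose definition appears in the `src/other/utils.jl` hunk further down. The helper computes the same running sum with a plain in-place loop, so it no longer relies on `Base.cumsum!`'s behaviour when source and destination alias (presumably the point of #63; the issue itself is not shown here). A minimal sketch of the equivalence:

```julia
# Helper as added in src/other/utils.jl (copied from the diff below).
function our_cumsum!(x)
    @inbounds for i in 2:length(x)
        x[i] += x[i-1]
    end
    x
end

new_lengths = ones(Int, 4)           # e.g. one output row per group
our_cumsum!(new_lengths)             # new_lengths is now [1, 2, 3, 4]
new_lengths == cumsum(ones(Int, 4))  # true -- same values, computed strictly in place
```
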
104 changes: 102 additions & 2 deletions src/dataset/other.jl
@@ -974,7 +974,7 @@ A convenient shortcut for `ds[byrow(ds, type, cols; ...), :]`.
`type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
See [`byrow`](@ref), [`filter!`](@ref)
See [`byrow`](@ref), [`filter!`](@ref), [`delete!`](@ref), [`delete`](@ref)
# Examples
@@ -1056,10 +1056,110 @@ It is a convenient shortcut for `deleteat![ds, .!byrow(ds, type, cols; ...)]`.
Refer to [`filter`](@ref) for examples.
See [`byrow`](@ref), [`filter`](@ref)
See [`byrow`](@ref), [`filter`](@ref), [`delete!`](@ref), [`delete`](@ref)
"""
Base.filter!(ds::Dataset, cols::Union{ColumnIndex, MultiColumnIndex}; type = all, kwargs...) = deleteat!(ds, .!byrow(ds, type, cols; kwargs...))

# filter out `true`s
"""
delete(ds::AbstractDataset, cols; [type = all, ...])
A convenient shortcut for `ds[.!byrow(ds, type, cols; ...), :]`.
`type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
Compare to [`deleteat!`](@ref)
See [`delete!`](@ref), [`byrow`](@ref), [`filter!`](@ref), [`filter`](@ref)
# Examples
```jldoctest
julia> ds = Dataset(x = [1,2,3,4,5], y = [1.5,2.3,-1,0,2.0], z = Bool[1,0,1,0,1])
5×3 Dataset
 Row │ x         y         z
     │ identity  identity  identity
     │ Int64?    Float64?  Bool?
─────┼──────────────────────────────
   1 │        1       1.5      true
   2 │        2       2.3     false
   3 │        3      -1.0      true
   4 │        4       0.0     false
   5 │        5       2.0      true

julia> delete(ds, :z)
2×3 Dataset
 Row │ x         y         z
     │ identity  identity  identity
     │ Int64?    Float64?  Bool?
─────┼──────────────────────────────
   1 │        2       2.3     false
   2 │        4       0.0     false

julia> delete(ds, 1:2, by = [iseven, >(2.0)])
4×3 Dataset
 Row │ x         y         z
     │ identity  identity  identity
     │ Int64?    Float64?  Bool?
─────┼──────────────────────────────
   1 │        1       1.5      true
   2 │        3      -1.0      true
   3 │        4       0.0     false
   4 │        5       2.0      true

julia> delete(ds, 1:2, type = any, by = [iseven, >(2.0)])
3×3 Dataset
 Row │ x         y         z
     │ identity  identity  identity
     │ Int64?    Float64?  Bool?
─────┼──────────────────────────────
   1 │        1       1.5      true
   2 │        3      -1.0      true
   3 │        5       2.0      true

julia> delete(ds, 1:3, type = issorted, rev = true)
3×3 Dataset
 Row │ x         y         z
     │ identity  identity  identity
     │ Int64?    Float64?  Bool?
─────┼──────────────────────────────
   1 │        1       1.5      true
   2 │        2       2.3     false
   3 │        3      -1.0      true

julia> delete(ds, 2:3, type = isless, with = :x)
2×3 Dataset
 Row │ x         y         z
     │ identity  identity  identity
     │ Int64?    Float64?  Bool?
─────┼──────────────────────────────
   1 │        1       1.5      true
   2 │        2       2.3     false
```
"""
function delete(ds::AbstractDataset, cols::Union{ColumnIndex, MultiColumnIndex}; view = false, type = all, kwargs...)
if view
Base.view(ds, .!byrow(ds, type, cols; kwargs...), :)
else
ds[.!byrow(ds, type, cols; kwargs...), :]
end
end
"""
delete!(ds::AbstractDataset, cols; [type = all, ...])
Variant of `delete` which removes the matching rows from the passed data set in place.
It is a convenient shortcut for `deleteat!(ds, byrow(ds, type, cols; ...))`.
`type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
Compare to [`deleteat!`](@ref)
Refer to [`delete`](@ref) for examples.
See [`delete`](@ref), [`byrow`](@ref), [`filter`](@ref), [`filter!`](@ref)
"""
Base.delete!(ds::Dataset, cols::Union{ColumnIndex, MultiColumnIndex}; type = all, kwargs...) = deleteat!(ds, byrow(ds, type, cols; kwargs...))

"""
mapcols(ds::AbstractDataset, f, cols)
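
Taken together, the new `delete` and `delete!` mirror `filter` and `filter!` but drop the rows where the row-wise predicate holds. The function body above also accepts a `view` keyword that the docstring does not spell out: with `view = true` it returns `Base.view(ds, .!byrow(ds, type, cols; ...), :)` instead of a copy. A short sketch of the three entry points, using the data set from the docstring examples:

```julia
using InMemoryDatasets

ds = Dataset(x = [1, 2, 3, 4, 5], y = [1.5, 2.3, -1, 0, 2.0], z = Bool[1, 0, 1, 0, 1])

kept = delete(ds, :z)               # new Dataset without the rows where z is true
v    = delete(ds, :z, view = true)  # same selection, but as a view over `ds` (no copy)
delete!(ds, :z)                     # removes those rows from `ds` itself, like `filter!`
```
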
4 changes: 2 additions & 2 deletions src/dataset/transpose.jl
@@ -814,7 +814,7 @@ function flatten!(ds::Dataset,
sort_permute_idxcols = [1]
idxcols_sorted = idxcols
end
cumsum!(lengths, lengths)
our_cumsum!(lengths)
for col in 1:length(idxcols_sorted)
col_to_flatten = all_idxcols[sort_permute_idxcols[col]]

@@ -884,7 +884,7 @@ function flatten(ds::AbstractDataset,
sort_permute_idxcols = [1]
idxcols_sorted = idxcols
end
cumsum!(lengths, lengths)
our_cumsum!(lengths)
for col in 1:length(idxcols_sorted)
col_to_flatten = all_idxcols[sort_permute_idxcols[col]]

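
In `flatten!`/`flatten` the accumulated vector holds per-row element counts, so the prefix sum gives each source row's end position in the flattened output. A small illustration of that arithmetic (the data and variable names are invented, not the package's internals; `our_cumsum!` is a one-line equivalent of the helper added in `src/other/utils.jl`):

```julia
our_cumsum!(x) = (foreach(i -> x[i] += x[i-1], 2:length(x)); x)  # in-place prefix sum

col = [[1, 2], [3], [4, 5, 6]]  # a column of vectors to flatten
lengths = map(length, col)      # [2, 1, 3]
our_cumsum!(lengths)            # [2, 3, 6] -- end position of each source row
total_rows = lengths[end]       # the flattened column has 6 rows
```
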
2 changes: 1 addition & 1 deletion src/join/closejoin.jl
@@ -336,7 +336,7 @@ function _join_closejoin(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, m
ranges, a, idx, minval, reps, sz, right_cols_2= _find_ranges_for_join_using_hash(dsl, dsr, onleft[1:end-1], onright[1:end-1], mapformats, true, Val(T), threads = threads)
filter!(!=(0), reps)
pushfirst!(reps, 1)
cumsum!(reps, reps)
our_cumsum!(reps)
pop!(reps)
grng = GIVENRANGE(idx, reps, Int[], length(reps))
starts, idx, last_valid_range = _sort_for_join_after_hash(dsr, oncols_right[end], stable, alg, mapformats, nsfpaj, grng, threads = threads)
12 changes: 6 additions & 6 deletions src/join/join.jl
@@ -420,7 +420,7 @@ function _mark_lt_part!(inbits, x_l, x_r, _fl::F1, _fr::F2, ranges, r_perms, en,
ranges[i] = 1:0
end
end
cumsum!(revised_ends, revised_ends)
our_cumsum!(revised_ends)
end

function _change_refpool_find_range_for_join!(ranges, dsl, dsr, r_perms, oncols_left, oncols_right, lmf, rmf, j; type = :both, nsfpaj = true, threads = true)
@@ -487,7 +487,7 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright, makeunique = false, map
end
end
new_ends = map(x -> max(1, length(x)), ranges)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end]

if check
@@ -579,7 +579,7 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig
end

new_ends = map(x -> max(1, length(x)), ranges)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end]

if check
@@ -673,7 +673,7 @@ function _join_inner(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, onrig
ranges, a, idx, minval, reps, sz, right_cols_2 = _find_ranges_for_join_using_hash(dsl, dsr, onleft[1:end-1], oncols_right[1:end-1], mapformats, true, Val(T); threads = threads)
filter!(!=(0), reps)
pushfirst!(reps, 1)
cumsum!(reps, reps)
our_cumsum!(reps)
pop!(reps)
grng = GIVENRANGE(idx, reps, Int[], length(reps))
starts, idx, last_valid_range = _sort_for_join_after_hash(dsr, right_range_cols[1], stable, alg, mapformats, nsfpaj, grng; threads = threads)
@@ -700,7 +700,7 @@ function _join_inner(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, onrig


new_ends = map(length, ranges)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end]

inbits = nothing
@@ -896,7 +896,7 @@ function _join_outer(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, makeu
end
new_ends = map(x -> max(1, length(x)), ranges)
notinleft = _find_right_not_in_left(ranges, nrow(dsr), idx)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end] + length(notinleft)

if check
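
The join routines use the same prefix sum to turn per-row match counts into output offsets; the `max(1, length(x))` guard keeps one output row for left rows with no match, which is what distinguishes the left/outer paths from the inner one. A standalone sketch of the arithmetic (the `ranges` values are invented; `our_cumsum!` is a one-line equivalent of the helper added in `src/other/utils.jl`):

```julia
our_cumsum!(x) = (foreach(i -> x[i] += x[i-1], 2:length(x)); x)  # in-place prefix sum

ranges = [1:2, 1:0, 3:3]                        # left row 2 has no match on the right
new_ends = map(x -> max(1, length(x)), ranges)  # [2, 1, 1]
our_cumsum!(new_ends)                           # [2, 3, 4] -- running end positions
total_length = new_ends[end]                    # 4 rows in the joined result
```
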
8 changes: 4 additions & 4 deletions src/join/join_dict.jl
@@ -209,7 +209,7 @@ function _join_left_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T}
_fill_ranges_for_dict_join!(ranges, dict, maxprob, _fl, _fr, _columns(dsl)[onleft[1]], _columns(dsr)[onright[1]], sz, type, threads = threads)

new_ends = map(x -> max(1, length(x)), ranges)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end]

if check
@@ -292,7 +292,7 @@ function _join_left!_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T
throw(ArgumentError("`leftjoin!` can only be used when each observation in left data set matches at most one observation from right data set"))
end
new_ends = map(x -> max(1, length(x)), ranges)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end]

if check
@@ -354,7 +354,7 @@ function _join_inner_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T
_fill_ranges_for_dict_join!(ranges, dict, maxprob, _fl, _fr, _columns(dsl)[onleft[1]], _columns(dsr)[onright[1]], sz, type, threads = threads)

new_ends = map(length, ranges)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end]

if check
@@ -430,7 +430,7 @@ function _join_outer_dict(dsl, dsr, ranges, onleft, onright, oncols_left, oncols
_fill_ranges_for_dict_join!(ranges, dict, maxprob, _fl, _fr, _columns(dsl)[onleft[1]], _columns(dsr)[onright[1]], sz, type, threads = threads)
new_ends = map(x -> max(1, length(x)), ranges)
notinleft = _find_right_not_in_left(ranges, nrow(dsr), 1:nrow(dsr))
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end] + length(notinleft)

if check
12 changes: 10 additions & 2 deletions src/other/utils.jl
@@ -93,6 +93,14 @@ function allocatecol(T, len)
_our_vect_alloc(Union{Missing, T}, len)
end

function our_cumsum!(x)
@inbounds for i in 2:length(x)
x[i] += x[i-1]
end
x
end


function _generate_inverted_dict_pool(x)
invp = DataAPI.invrefpool(x)
if invp isa Dict
@@ -174,7 +182,7 @@ function _sortitout!(res, starts, x)
starts[x[i] + 1] += 1
end
starts_normalised = map(>(0), starts)
cumsum!(starts, starts)
our_cumsum!(starts)
for i in 1:length(x)
label = x[i]
res[starts[label]] = i
@@ -226,7 +234,7 @@ function _calculate_ends(groups, ngroups, ::Val{T}) where T
@inbounds for i = 1:length(groups)
where[groups[i]] += 1
end
START_END(false, length(groups), cumsum!(where, where))
START_END(false, length(groups), our_cumsum!(where))
end


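
`_sortitout!` and `_calculate_ends` use the prefix sum for the usual counting-sort bookkeeping: count how many rows fall in each group, then accumulate the counts into group boundaries. A self-contained sketch of the idea (not the package's exact code, which also handles the off-by-one offsets visible in the hunk; `our_cumsum!` is the helper defined above):

```julia
our_cumsum!(x) = (foreach(i -> x[i] += x[i-1], 2:length(x)); x)  # in-place prefix sum

x = [2, 1, 3, 1, 2]          # group label of each row (1-based)
ngroups = 3

counts = zeros(Int, ngroups)
for label in x
    counts[label] += 1       # group sizes: [2, 2, 1]
end
our_cumsum!(counts)          # group end positions: [2, 4, 5]

# Fill each group's slots from its end position backwards.
res = zeros(Int, length(x))
for i in length(x):-1:1
    label = x[i]
    res[counts[label]] = i
    counts[label] -= 1
end
res                          # row indices grouped together: [2, 4, 1, 5, 3]
```
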
4 changes: 2 additions & 2 deletions src/sort/groupby.jl
@@ -196,7 +196,7 @@ function combine(gds::Union{GroupBy, GatherBy}, @nospecialize(args...); dropgrou
# we will use new_lengths later for assigning the grouping info of the new ds
if _first_vector_res == 0
new_lengths = ones(Int, ngroups)
cumsum!(new_lengths, new_lengths)
our_cumsum!(new_lengths)
total_lengths = ngroups
else
if ms[_first_vector_res].first isa Tuple
@@ -215,7 +215,7 @@ function combine(gds::Union{GroupBy, GatherBy}, @nospecialize(args...); dropgrou
_compute_the_mutli_row_trans!(special_res, new_lengths, _threaded_permute_for_groupby(_columns(gds.parent)[index(gds.parent)[ms[_first_vector_res].first]], a[1], threads = threads), nrow(gds.parent), ms[_first_vector_res].second.first, _first_vector_res, starts, ngroups, threads)
end
# special_res, new_lengths = _compute_the_mutli_row_trans(ds, ms, _first_vector_res, starts, ngroups)
cumsum!(new_lengths, new_lengths)
our_cumsum!(new_lengths)
total_lengths = new_lengths[end]
end
all_names = _names(gds.parent)
22 changes: 22 additions & 0 deletions test/data.jl
@@ -365,6 +365,28 @@ end
@test byrow(ds, all, :, by = [>(5), ==(10)], threads = false) == [falses(500);trues(500)]
@test byrow(mask(view(ds, nrow(ds):-1:1, ncol(ds):-1:1), [>(5), ==(10)], [2,1], threads = false), all, threads = false) == [trues(500);falses(500)]
@test byrow(view(ds, nrow(ds):-1:1, ncol(ds):-1:1), all, [2,1], by = [>(5), ==(10)], threads = false) == [trues(500);falses(500)]


ds = Dataset(x=[3, 1, 2, 1], y=["b", "c", "a", "b"])
@test delete(ds, 1, by = >(1)) == Dataset(x=[1, 1], y=["c", "b"])
@test delete!(ds, 1, by = >(1)) === ds == Dataset(x=[1, 1], y=["c", "b"])

ds = Dataset(x=[3, 1, 2, 1], y=["b", "c", "a", "b"])
@test delete(ds, :x, by = >(1)) == Dataset(x=[1, 1], y=["c", "b"])
@test delete!(ds, :x, by = >(1)) === ds == Dataset(x=[1, 1], y=["c", "b"])

ds = Dataset(x = [1,2,missing,1], y = ["a", "d", "c", "f"])
@test delete(ds, :, type = all, by = [isequal(1), >("a")]) == ds[[true, true, true, false],:]
setformat!(ds, 1=>isodd)
@test delete(ds, :, type = all, by = [isequal(1), >("a")]) == ds[[true, true, true, false],:]
@test delete(ds, :, type = all, by = [isequal(1), >("a")], mapformats = true) == ds[[true, true, true, false],:]
@test delete(ds, :, by = [isequal(1), ==("a")], mapformats = true) == ds[[false, true, true, true],:]
setformat!(ds, 1=>iseven)
@test delete(ds, 1, by = isequal(1), mapformats = true) == ds[[true, false, true, true],:]

ds = Dataset(x = repeat(1:10, inner = 100), y = 10)
@test delete(ds, :, by = [>(5), ==(10)]) == ds[[trues(500);falses(500)],:]
@test delete(view(ds, nrow(ds):-1:1, ncol(ds):-1:1), [2,1], by = [>(5), ==(10)]) == view(ds, nrow(ds):-1:1, ncol(ds):-1:1)[[falses(500);trues(500)],:]
end

@testset "ffill, ffill!, bfill, bfill!" begin
