fix #63 + some efforts to fix #64
sl-solution committed Jun 9, 2022
1 parent bc2cb3f commit 4d213f0
Showing 10 changed files with 154 additions and 23 deletions.
1 change: 1 addition & 0 deletions src/InMemoryDatasets.jl
@@ -63,6 +63,7 @@ export
repeat!,
select,
select!,
delete,
mapcols,
insertcols!,
mask,
8 changes: 4 additions & 4 deletions src/dataset/combine.jl
@@ -618,7 +618,7 @@ function combine(ds::Dataset, @nospecialize(args...); dropgroupcols = false, thr
# we will use new_lengths later for assigning the grouping info of the new ds
if _first_vector_res == 0
new_lengths = ones(Int, ngroups)
cumsum!(new_lengths, new_lengths)
our_cumsum!(new_lengths)
total_lengths = ngroups
else
if ms[_first_vector_res].first isa Tuple
@@ -637,7 +637,7 @@ function combine(ds::Dataset, @nospecialize(args...); dropgroupcols = false, thr
_compute_the_mutli_row_trans!(special_res, new_lengths, _columns(ds)[index(ds)[ms[_first_vector_res].first]], nrow(ds), ms[_first_vector_res].second.first, _first_vector_res, starts, ngroups, threads)
end
# special_res, new_lengths = _compute_the_mutli_row_trans(ds, ms, _first_vector_res, starts, ngroups)
cumsum!(new_lengths, new_lengths)
our_cumsum!(new_lengths)
total_lengths = new_lengths[end]
end
all_names = _names(ds)
@@ -715,7 +715,7 @@ function combine_ds(ds::AbstractDataset, @nospecialize(args...); threads = true)
# we will use new_lengths later for assigning the grouping info of the new ds
if _first_vector_res == 0
new_lengths = ones(Int, ngroups)
cumsum!(new_lengths, new_lengths)
our_cumsum!(new_lengths)
total_lengths = ngroups
else
if ms[_first_vector_res].first isa Tuple
Expand All @@ -734,7 +734,7 @@ function combine_ds(ds::AbstractDataset, @nospecialize(args...); threads = true)
_compute_the_mutli_row_trans!(special_res, new_lengths, _columns(ds)[index(ds)[ms[_first_vector_res].first]], nrow(ds), ms[_first_vector_res].second.first, _first_vector_res, starts, ngroups, threads)
end
# special_res, new_lengths = _compute_the_mutli_row_trans(ds, ms, _first_vector_res, starts, ngroups)
cumsum!(new_lengths, new_lengths)
our_cumsum!(new_lengths)
total_lengths = new_lengths[end]
end
all_names = _names(ds)
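
Every call site in this commit replaces `cumsum!(v, v)` with the new package-local helper `our_cumsum!(v)`, whose definition appears in the `src/other/utils.jl` hunk further down. The helper computes the same running sum with a plain in-place loop, so it no longer relies on `Base.cumsum!`'s behaviour when source and destination alias (presumably the point of #63; the issue itself is not shown here). A minimal sketch of the equivalence:

```julia
# Helper as added in src/other/utils.jl (copied from the diff below).
function our_cumsum!(x)
    @inbounds for i in 2:length(x)
        x[i] += x[i-1]
    end
    x
end

new_lengths = ones(Int, 4)           # e.g. one output row per group
our_cumsum!(new_lengths)             # new_lengths is now [1, 2, 3, 4]
new_lengths == cumsum(ones(Int, 4))  # true -- same values, computed strictly in place
```
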
104 changes: 102 additions & 2 deletions src/dataset/other.jl
@@ -974,7 +974,7 @@ A convenient shortcut for `ds[byrow(ds, type, cols; ...), :]`.
`type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
See [`byrow`](@ref), [`filter!`](@ref)
See [`byrow`](@ref), [`filter!`](@ref), [`delete!`](@ref), [`delete`](@ref)
# Examples
@@ -1056,10 +1056,110 @@ It is a convenient shortcut for `deleteat![ds, .!byrow(ds, type, cols; ...)]`.
Refer to [`filter`](@ref) for examples.
See [`byrow`](@ref), [`filter`](@ref)
See [`byrow`](@ref), [`filter`](@ref), [`delete!`](@ref), [`delete`](@ref)
"""
Base.filter!(ds::Dataset, cols::Union{ColumnIndex, MultiColumnIndex}; type = all, kwargs...) = deleteat!(ds, .!byrow(ds, type, cols; kwargs...))

# filter out `true`s
"""
delete(ds::AbstractDataset, cols; [type = all, ...])
A convenient shortcut for `ds[.!byrow(ds, type, cols; ...), :]`.
`type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
Compare to [`deleteat!`](@ref)
See [`delete!`](@ref), [`byrow`](@ref), [`filter!`](@ref), [`filter`](@ref)
# Examples
```jldoctest
julia> ds = Dataset(x = [1,2,3,4,5], y = [1.5,2.3,-1,0,2.0], z = Bool[1,0,1,0,1])
5×3 Dataset
 Row │ x         y         z
     │ identity  identity  identity
     │ Int64?    Float64?  Bool?
─────┼──────────────────────────────
   1 │        1       1.5      true
   2 │        2       2.3     false
   3 │        3      -1.0      true
   4 │        4       0.0     false
   5 │        5       2.0      true

julia> delete(ds, :z)
2×3 Dataset
 Row │ x         y         z
     │ identity  identity  identity
     │ Int64?    Float64?  Bool?
─────┼──────────────────────────────
   1 │        2       2.3     false
   2 │        4       0.0     false

julia> delete(ds, 1:2, by = [iseven, >(2.0)])
4×3 Dataset
 Row │ x         y         z
     │ identity  identity  identity
     │ Int64?    Float64?  Bool?
─────┼──────────────────────────────
   1 │        1       1.5      true
   2 │        3      -1.0      true
   3 │        4       0.0     false
   4 │        5       2.0      true

julia> delete(ds, 1:2, type = any, by = [iseven, >(2.0)])
3×3 Dataset
 Row │ x         y         z
     │ identity  identity  identity
     │ Int64?    Float64?  Bool?
─────┼──────────────────────────────
   1 │        1       1.5      true
   2 │        3      -1.0      true
   3 │        5       2.0      true

julia> delete(ds, 1:3, type = issorted, rev = true)
3×3 Dataset
 Row │ x         y         z
     │ identity  identity  identity
     │ Int64?    Float64?  Bool?
─────┼──────────────────────────────
   1 │        1       1.5      true
   2 │        2       2.3     false
   3 │        3      -1.0      true

julia> delete(ds, 2:3, type = isless, with = :x)
2×3 Dataset
 Row │ x         y         z
     │ identity  identity  identity
     │ Int64?    Float64?  Bool?
─────┼──────────────────────────────
   1 │        1       1.5      true
   2 │        2       2.3     false
```
"""
function delete(ds::AbstractDataset, cols::Union{ColumnIndex, MultiColumnIndex}; view = false, type = all, kwargs...)
if view
Base.view(ds, .!byrow(ds, type, cols; kwargs...), :)
else
ds[.!byrow(ds, type, cols; kwargs...), :]
end
end
"""
delete!(ds::AbstractDataset, cols; [type = all, ...])
Variant of `delete` which removes the matching rows from the passed data set in place.
It is a convenient shortcut for `deleteat!(ds, byrow(ds, type, cols; ...))`.
`type` can be any function supported by `byrow` which returns a Vector{Bool} or BitVector.
Compare to [`deleteat!`](@ref)
Refer to [`delete`](@ref) for examples.
See [`delete`](@ref), [`byrow`](@ref), [`filter`](@ref), [`filter!`](@ref)
"""
Base.delete!(ds::Dataset, cols::Union{ColumnIndex, MultiColumnIndex}; type = all, kwargs...) = deleteat!(ds, byrow(ds, type, cols; kwargs...))

"""
mapcols(ds::AbstractDataset, f, cols)
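
Taken together, the new `delete` and `delete!` mirror `filter` and `filter!` but drop the rows where the row-wise predicate holds. The function body above also accepts a `view` keyword that the docstring does not spell out: with `view = true` it returns `Base.view(ds, .!byrow(ds, type, cols; ...), :)` instead of a copy. A short sketch of the three entry points, using the data set from the docstring examples:

```julia
using InMemoryDatasets

ds = Dataset(x = [1, 2, 3, 4, 5], y = [1.5, 2.3, -1, 0, 2.0], z = Bool[1, 0, 1, 0, 1])

kept = delete(ds, :z)               # new Dataset without the rows where z is true
v    = delete(ds, :z, view = true)  # same selection, but as a view over `ds` (no copy)
delete!(ds, :z)                     # removes those rows from `ds` itself, like `filter!`
```
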
4 changes: 2 additions & 2 deletions src/dataset/transpose.jl
@@ -814,7 +814,7 @@ function flatten!(ds::Dataset,
sort_permute_idxcols = [1]
idxcols_sorted = idxcols
end
cumsum!(lengths, lengths)
our_cumsum!(lengths)
for col in 1:length(idxcols_sorted)
col_to_flatten = all_idxcols[sort_permute_idxcols[col]]

@@ -884,7 +884,7 @@ function flatten(ds::AbstractDataset,
sort_permute_idxcols = [1]
idxcols_sorted = idxcols
end
cumsum!(lengths, lengths)
our_cumsum!(lengths)
for col in 1:length(idxcols_sorted)
col_to_flatten = all_idxcols[sort_permute_idxcols[col]]

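
In `flatten!`/`flatten` the accumulated vector holds per-row element counts, so the prefix sum gives each source row's end position in the flattened output. A small illustration of that arithmetic (the data and variable names are invented, not the package's internals; `our_cumsum!` is a one-line equivalent of the helper added in `src/other/utils.jl`):

```julia
our_cumsum!(x) = (foreach(i -> x[i] += x[i-1], 2:length(x)); x)  # in-place prefix sum

col = [[1, 2], [3], [4, 5, 6]]  # a column of vectors to flatten
lengths = map(length, col)      # [2, 1, 3]
our_cumsum!(lengths)            # [2, 3, 6] -- end position of each source row
total_rows = lengths[end]       # the flattened column has 6 rows
```
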
2 changes: 1 addition & 1 deletion src/join/closejoin.jl
@@ -336,7 +336,7 @@ function _join_closejoin(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, m
ranges, a, idx, minval, reps, sz, right_cols_2= _find_ranges_for_join_using_hash(dsl, dsr, onleft[1:end-1], onright[1:end-1], mapformats, true, Val(T), threads = threads)
filter!(!=(0), reps)
pushfirst!(reps, 1)
cumsum!(reps, reps)
our_cumsum!(reps)
pop!(reps)
grng = GIVENRANGE(idx, reps, Int[], length(reps))
starts, idx, last_valid_range = _sort_for_join_after_hash(dsr, oncols_right[end], stable, alg, mapformats, nsfpaj, grng, threads = threads)
12 changes: 6 additions & 6 deletions src/join/join.jl
@@ -420,7 +420,7 @@ function _mark_lt_part!(inbits, x_l, x_r, _fl::F1, _fr::F2, ranges, r_perms, en,
ranges[i] = 1:0
end
end
cumsum!(revised_ends, revised_ends)
our_cumsum!(revised_ends)
end

function _change_refpool_find_range_for_join!(ranges, dsl, dsr, r_perms, oncols_left, oncols_right, lmf, rmf, j; type = :both, nsfpaj = true, threads = true)
@@ -487,7 +487,7 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright, makeunique = false, map
end
end
new_ends = map(x -> max(1, length(x)), ranges)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end]

if check
@@ -579,7 +579,7 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig
end

new_ends = map(x -> max(1, length(x)), ranges)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end]

if check
@@ -673,7 +673,7 @@ function _join_inner(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, onrig
ranges, a, idx, minval, reps, sz, right_cols_2 = _find_ranges_for_join_using_hash(dsl, dsr, onleft[1:end-1], oncols_right[1:end-1], mapformats, true, Val(T); threads = threads)
filter!(!=(0), reps)
pushfirst!(reps, 1)
cumsum!(reps, reps)
our_cumsum!(reps)
pop!(reps)
grng = GIVENRANGE(idx, reps, Int[], length(reps))
starts, idx, last_valid_range = _sort_for_join_after_hash(dsr, right_range_cols[1], stable, alg, mapformats, nsfpaj, grng; threads = threads)
@@ -700,7 +700,7 @@ function _join_inner(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, onrig


new_ends = map(length, ranges)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end]

inbits = nothing
@@ -896,7 +896,7 @@ function _join_outer(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, makeu
end
new_ends = map(x -> max(1, length(x)), ranges)
notinleft = _find_right_not_in_left(ranges, nrow(dsr), idx)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end] + length(notinleft)

if check
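
The join routines use the same prefix sum to turn per-row match counts into output offsets; the `max(1, length(x))` guard keeps one output row for left rows with no match, which is what distinguishes the left/outer paths from the inner one. A standalone sketch of the arithmetic (the `ranges` values are invented; `our_cumsum!` is a one-line equivalent of the helper added in `src/other/utils.jl`):

```julia
our_cumsum!(x) = (foreach(i -> x[i] += x[i-1], 2:length(x)); x)  # in-place prefix sum

ranges = [1:2, 1:0, 3:3]                        # left row 2 has no match on the right
new_ends = map(x -> max(1, length(x)), ranges)  # [2, 1, 1]
our_cumsum!(new_ends)                           # [2, 3, 4] -- running end positions
total_length = new_ends[end]                    # 4 rows in the joined result
```
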
8 changes: 4 additions & 4 deletions src/join/join_dict.jl
@@ -209,7 +209,7 @@ function _join_left_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T}
_fill_ranges_for_dict_join!(ranges, dict, maxprob, _fl, _fr, _columns(dsl)[onleft[1]], _columns(dsr)[onright[1]], sz, type, threads = threads)

new_ends = map(x -> max(1, length(x)), ranges)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end]

if check
@@ -292,7 +292,7 @@ function _join_left!_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T
throw(ArgumentError("`leftjoin!` can only be used when each observation in left data set matches at most one observation from right data set"))
end
new_ends = map(x -> max(1, length(x)), ranges)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end]

if check
@@ -354,7 +354,7 @@ function _join_inner_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T
_fill_ranges_for_dict_join!(ranges, dict, maxprob, _fl, _fr, _columns(dsl)[onleft[1]], _columns(dsr)[onright[1]], sz, type, threads = threads)

new_ends = map(length, ranges)
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end]

if check
@@ -430,7 +430,7 @@ function _join_outer_dict(dsl, dsr, ranges, onleft, onright, oncols_left, oncols
_fill_ranges_for_dict_join!(ranges, dict, maxprob, _fl, _fr, _columns(dsl)[onleft[1]], _columns(dsr)[onright[1]], sz, type, threads = threads)
new_ends = map(x -> max(1, length(x)), ranges)
notinleft = _find_right_not_in_left(ranges, nrow(dsr), 1:nrow(dsr))
cumsum!(new_ends, new_ends)
our_cumsum!(new_ends)
total_length = new_ends[end] + length(notinleft)

if check
12 changes: 10 additions & 2 deletions src/other/utils.jl
@@ -93,6 +93,14 @@ function allocatecol(T, len)
_our_vect_alloc(Union{Missing, T}, len)
end

function our_cumsum!(x)
@inbounds for i in 2:length(x)
x[i] += x[i-1]
end
x
end


function _generate_inverted_dict_pool(x)
invp = DataAPI.invrefpool(x)
if invp isa Dict
@@ -174,7 +182,7 @@ function _sortitout!(res, starts, x)
starts[x[i] + 1] += 1
end
starts_normalised = map(>(0), starts)
cumsum!(starts, starts)
our_cumsum!(starts)
for i in 1:length(x)
label = x[i]
res[starts[label]] = i
@@ -226,7 +234,7 @@ function _calculate_ends(groups, ngroups, ::Val{T}) where T
@inbounds for i = 1:length(groups)
where[groups[i]] += 1
end
START_END(false, length(groups), cumsum!(where, where))
START_END(false, length(groups), our_cumsum!(where))
end


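
`_sortitout!` and `_calculate_ends` use the prefix sum for the usual counting-sort bookkeeping: count how many rows fall in each group, then accumulate the counts into group boundaries. A self-contained sketch of the idea (not the package's exact code, which also handles the off-by-one offsets visible in the hunk; `our_cumsum!` is the helper defined above):

```julia
our_cumsum!(x) = (foreach(i -> x[i] += x[i-1], 2:length(x)); x)  # in-place prefix sum

x = [2, 1, 3, 1, 2]          # group label of each row (1-based)
ngroups = 3

counts = zeros(Int, ngroups)
for label in x
    counts[label] += 1       # group sizes: [2, 2, 1]
end
our_cumsum!(counts)          # group end positions: [2, 4, 5]

# Fill each group's slots from its end position backwards.
res = zeros(Int, length(x))
for i in length(x):-1:1
    label = x[i]
    res[counts[label]] = i
    counts[label] -= 1
end
res                          # row indices grouped together: [2, 4, 1, 5, 3]
```
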
4 changes: 2 additions & 2 deletions src/sort/groupby.jl
@@ -196,7 +196,7 @@ function combine(gds::Union{GroupBy, GatherBy}, @nospecialize(args...); dropgrou
# we will use new_lengths later for assigning the grouping info of the new ds
if _first_vector_res == 0
new_lengths = ones(Int, ngroups)
cumsum!(new_lengths, new_lengths)
our_cumsum!(new_lengths)
total_lengths = ngroups
else
if ms[_first_vector_res].first isa Tuple
@@ -215,7 +215,7 @@ function combine(gds::Union{GroupBy, GatherBy}, @nospecialize(args...); dropgrou
_compute_the_mutli_row_trans!(special_res, new_lengths, _threaded_permute_for_groupby(_columns(gds.parent)[index(gds.parent)[ms[_first_vector_res].first]], a[1], threads = threads), nrow(gds.parent), ms[_first_vector_res].second.first, _first_vector_res, starts, ngroups, threads)
end
# special_res, new_lengths = _compute_the_mutli_row_trans(ds, ms, _first_vector_res, starts, ngroups)
cumsum!(new_lengths, new_lengths)
our_cumsum!(new_lengths)
total_lengths = new_lengths[end]
end
all_names = _names(gds.parent)
22 changes: 22 additions & 0 deletions test/data.jl
@@ -365,6 +365,28 @@ end
@test byrow(ds, all, :, by = [>(5), ==(10)], threads = false) == [falses(500);trues(500)]
@test byrow(mask(view(ds, nrow(ds):-1:1, ncol(ds):-1:1), [>(5), ==(10)], [2,1], threads = false), all, threads = false) == [trues(500);falses(500)]
@test byrow(view(ds, nrow(ds):-1:1, ncol(ds):-1:1), all, [2,1], by = [>(5), ==(10)], threads = false) == [trues(500);falses(500)]


ds = Dataset(x=[3, 1, 2, 1], y=["b", "c", "a", "b"])
@test delete(ds, 1, by = >(1)) == Dataset(x=[1, 1], y=["c", "b"])
@test delete!(ds, 1, by = >(1)) === ds == Dataset(x=[1, 1], y=["c", "b"])

ds = Dataset(x=[3, 1, 2, 1], y=["b", "c", "a", "b"])
@test delete(ds, :x, by = >(1)) == Dataset(x=[1, 1], y=["c", "b"])
@test delete!(ds, :x, by = >(1)) === ds == Dataset(x=[1, 1], y=["c", "b"])

ds = Dataset(x = [1,2,missing,1], y = ["a", "d", "c", "f"])
@test delete(ds, :, type = all, by = [isequal(1), >("a")]) == ds[[true, true, true, false],:]
setformat!(ds, 1=>isodd)
@test delete(ds, :, type = all, by = [isequal(1), >("a")]) == ds[[true, true, true, false],:]
@test delete(ds, :, type = all, by = [isequal(1), >("a")], mapformats = true) == ds[[true, true, true, false],:]
@test delete(ds, :, by = [isequal(1), ==("a")], mapformats = true) == ds[[false, true, true, true],:]
setformat!(ds, 1=>iseven)
@test delete(ds, 1, by = isequal(1), mapformats = true) == ds[[true, false, true, true],:]

ds = Dataset(x = repeat(1:10, inner = 100), y = 10)
@test delete(ds, :, by = [>(5), ==(10)]) == ds[[trues(500);falses(500)],:]
@test delete(view(ds, nrow(ds):-1:1, ncol(ds):-1:1), [2,1], by = [>(5), ==(10)]) == view(ds, nrow(ds):-1:1, ncol(ds):-1:1)[[falses(500);trues(500)],:]
end

@testset "ffill, ffill!, bfill, bfill!" begin
