diff --git a/asv_bench/benchmarks/move.py b/asv_bench/benchmarks/move.py index 9e24560d89..47056d9bac 100644 --- a/asv_bench/benchmarks/move.py +++ b/asv_bench/benchmarks/move.py @@ -40,6 +40,9 @@ def time_move_argmax(self, dtype, shape, window): def time_move_median(self, dtype, shape, window): bn.move_median(self.arr, window) + def time_move_quantile(self, dtype, shape, window): + bn.move_quantile(self.arr, window, q=0.25) + def time_move_rank(self, dtype, shape, window): bn.move_rank(self.arr, window) @@ -83,6 +86,9 @@ def time_move_argmax(self, dtype, shape, order, axis, window): def time_move_median(self, dtype, shape, order, axis, window): bn.move_median(self.arr, window, axis=axis) + + def time_move_quantile(self, dtype, shape, order, axis, window): + bn.move_quantile(self.arr, window, axis=axis, q=0.25) def time_move_rank(self, dtype, shape, order, axis, window): bn.move_rank(self.arr, window, axis=axis) diff --git a/bottleneck/__init__.py b/bottleneck/__init__.py index c5495b3337..d99fc782ed 100644 --- a/bottleneck/__init__.py +++ b/bottleneck/__init__.py @@ -5,7 +5,10 @@ from . import slow from ._pytesttester import PytestTester from .move import (move_argmax, move_argmin, move_max, move_mean, move_median, - move_min, move_rank, move_std, move_sum, move_var) + move_min, move_rank, move_std, move_sum, move_var) + +from .src.move_quantile import move_quantile + from .nonreduce import replace from .nonreduce_axis import (argpartition, nanrankdata, partition, push, rankdata) diff --git a/bottleneck/benchmark/bench.py b/bottleneck/benchmark/bench.py index f18ec5a19c..408a332028 100644 --- a/bottleneck/benchmark/bench.py +++ b/bottleneck/benchmark/bench.py @@ -198,6 +198,8 @@ def getsetups(setup, shapes, nans, axes, dtype, order): run = {} run["name"] = func run["statements"] = ["bn_func(a, w, 1, axis)", "sw_func(a, w, 1, axis)"] + if func == "move_quantile": + run["statements"] = ["bn_func(a, w, 1, axis, q=0.25)", "sw_func(a, w, 1, axis, q=0.25)"] setup = """ from bottleneck.slow.move import %s as sw_func from bottleneck import %s as bn_func diff --git a/bottleneck/benchmark/bench_detailed.py b/bottleneck/benchmark/bench_detailed.py index 4907446414..da30ba1ec0 100644 --- a/bottleneck/benchmark/bench_detailed.py +++ b/bottleneck/benchmark/bench_detailed.py @@ -94,12 +94,14 @@ def benchsuite(function, fraction_nan): index = 0 elif function in ["rankdata", "nanrankdata"]: index = 0 - elif function in bn.get_functions("move", as_string=True): + elif function in bn.get_functions("move", as_string=True) and function != "move_quantile": index = 1 elif function in ["partition", "argpartition", "push"]: index = 2 elif function == "replace": index = 3 + elif function == "move_quantile": + index = 4 else: raise ValueError("`function` (%s) not recognized" % function) @@ -133,23 +135,24 @@ def get_instructions(): "(a, 1)", # move "(a, 0)", # (arg)partition "(a, np.nan, 0)", # replace + "(a, 1, q=0.25)", # move_quantile 10, ), - ("rand(10)", "(a)", "(a, 2)", "(a, 2)", "(a, np.nan, 0)", 10), - ("rand(100)", "(a)", "(a, 20)", "(a, 20)", "(a, np.nan, 0)", 6), - ("rand(1000)", "(a)", "(a, 200)", "(a, 200)", "(a, np.nan, 0)", 3), - ("rand(1000000)", "(a)", "(a, 200)", "(a, 200)", "(a, np.nan, 0)", 2), + ("rand(10)", "(a)", "(a, 2)", "(a, 2)", "(a, np.nan, 0)", "(a, 2, q=0.25)", 10), + ("rand(100)", "(a)", "(a, 20)", "(a, 20)", "(a, np.nan, 0)", "(a, 20, q=0.25)", 6), + ("rand(1000)", "(a)", "(a, 200)", "(a, 200)", "(a, np.nan, 0)", "(a, 200, q=0.25)", 3), + ("rand(1000000)", "(a)", "(a, 200)", "(a, 200)", "(a, np.nan, 0)", None, 2), # 2d input array - ("rand(10, 10)", "(a)", "(a, 2)", "(a, 2)", "(a, np.nan, 0)", 6), - ("rand(100, 100)", "(a)", "(a, 20)", "(a, 20)", "(a, np.nan, 0)", 3), - ("rand(1000, 1000)", "(a)", "(a, 200)", "(a, 200)", "(a, np.nan, 0)", 2), - ("rand(10, 10)", "(a, 1)", None, None, None, 6), - ("rand(100, 100)", "(a, 1)", None, None, None, 3), - ("rand(1000, 1000)", "(a, 1)", None, None, None, 2), - ("rand(100000, 2)", "(a, 1)", "(a, 1)", "(a, 1)", None, 2), - ("rand(10, 10)", "(a, 0)", None, None, None, 6), - ("rand(100, 100)", "(a, 0)", "(a, 20, axis=0)", None, None, 3), - ("rand(1000, 1000)", "(a, 0)", "(a, 200, axis=0)", None, None, 2), + ("rand(10, 10)", "(a)", "(a, 2)", "(a, 2)", "(a, np.nan, 0)", "(a, 2, q=0.25)", 6), + ("rand(100, 100)", "(a)", "(a, 20)", "(a, 20)", "(a, np.nan, 0)", "(a, 20, q=0.25)", 3), + ("rand(1000, 1000)", "(a)", "(a, 200)", "(a, 200)", "(a, np.nan, 0)", None ,2), + ("rand(10, 10)", "(a, 1)", None, None, None, None, 6), + ("rand(100, 100)", "(a, 1)", None, None, None, None, 3), + ("rand(1000, 1000)", "(a, 1)", None, None, None, None, 2), + ("rand(100000, 2)", "(a, 1)", "(a, 1)", "(a, 1)", None, None, 2), + ("rand(10, 10)", "(a, 0)", None, None, None, None, 6), + ("rand(100, 100)", "(a, 0)", "(a, 20, axis=0)", None, None, None, 3), + ("rand(1000, 1000)", "(a, 0)", "(a, 200, axis=0)", None, None, None, 2), # 3d input array ( "rand(100, 100, 100)", @@ -157,6 +160,7 @@ def get_instructions(): "(a, 20, axis=0)", "(a, 20, axis=0)", None, + "(a, 20, axis=0, q=0.25)", 2, ), ( @@ -165,6 +169,7 @@ def get_instructions(): "(a, 20, axis=1)", "(a, 20, axis=1)", None, + "(a, 20, axis=1, q=0.25)", 2, ), ( @@ -173,10 +178,11 @@ def get_instructions(): "(a, 20, axis=2)", "(a, 20, axis=2)", "(a, np.nan, 0)", + "(a, 20, axis=2, q=0.25)", 2, ), # 0d input array - ("array(1.0)", "(a)", None, None, "(a, 0, 2)", 10), + ("array(1.0)", "(a)", None, None, "(a, 0, 2)", None, 10), ] return instructions diff --git a/bottleneck/slow/move.py b/bottleneck/slow/move.py index 0aa06f1419..27e6c5dd37 100644 --- a/bottleneck/slow/move.py +++ b/bottleneck/slow/move.py @@ -14,6 +14,7 @@ "move_argmin", "move_argmax", "move_median", + "move_quantile", "move_rank", ] @@ -37,13 +38,15 @@ def move_var(a, window, min_count=None, axis=-1, ddof=0): "Slow move_var for unaccelerated dtype" return move_func(np.nanvar, a, window, min_count, axis=axis, ddof=ddof) - -def move_min(a, window, min_count=None, axis=-1): +# move_min, move_max, and move_median from bn.slow can be called +# from bn.move_quantile in case of byte swapped input array, +# and so can take `q` argument, hence add **kwargs to these functions +def move_min(a, window, min_count=None, axis=-1, **kwargs): "Slow move_min for unaccelerated dtype" return move_func(np.nanmin, a, window, min_count, axis=axis) -def move_max(a, window, min_count=None, axis=-1): +def move_max(a, window, min_count=None, axis=-1, **kwargs): "Slow move_max for unaccelerated dtype" return move_func(np.nanmax, a, window, min_count, axis=axis) @@ -100,16 +103,43 @@ def argmax(a, axis): return move_func(argmax, a, window, min_count, axis=axis) -def move_median(a, window, min_count=None, axis=-1): +def move_median(a, window, min_count=None, axis=-1, **kwargs): "Slow move_median for unaccelerated dtype" return move_func(np.nanmedian, a, window, min_count, axis=axis) +# keyword argument for interpolation method in np.nanquantile was changed in 1.22.0 +import pkg_resources +if pkg_resources.parse_version(np.__version__) >= pkg_resources.parse_version("1.22.0"): + METHOD_KEYWORD = "method" +else: + METHOD_KEYWORD = "interpolation" + +def move_quantile(a, window, min_count=None, axis=-1, q=0.5, **kwargs): + "Slow move_quantile for unaccelerated dtype" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + if not np.isinf(a).any(): + kwargs[METHOD_KEYWORD] = 'midpoint' + return move_func(np.nanquantile, a, window, min_count, axis=axis, q=q, **kwargs) + else: + return move_func(np_nanquantile_infs, a, window, min_count, axis=axis, q=q, **kwargs) + def move_rank(a, window, min_count=None, axis=-1): "Slow move_rank for unaccelerated dtype" return move_func(lastrank, a, window, min_count, axis=axis) +# function for handling infs in np.nanquantile +def np_nanquantile_infs(a, **kwargs): + kwargs[METHOD_KEYWORD] = 'lower' + lower_nanquantile = np.nanquantile(a, **kwargs) + kwargs[METHOD_KEYWORD] = 'higher' + higher_nanquantile = np.nanquantile(a, **kwargs) + + midpoint_nanquantile = (lower_nanquantile + higher_nanquantile) / 2 + return midpoint_nanquantile + # magic utility functions --------------------------------------------------- diff --git a/bottleneck/src/move_median/move_median.c b/bottleneck/src/move_median/move_median.c index 3f0709abe4..7c49ac72e4 100644 --- a/bottleneck/src/move_median/move_median.c +++ b/bottleneck/src/move_median/move_median.c @@ -27,6 +27,12 @@ idx1 = idx2 /* helper functions */ static inline ai_t mm_get_median(mm_handle *mm); +static inline ai_t mm_get_median_no_nans_full_window_odd(mm_handle *mm); +static inline ai_t mm_get_median_no_nans_full_window_even(mm_handle *mm); +static inline idx_t mm_get_nl(mm_handle *mm, idx_t idx, short adjustment); +static inline ai_t mq_get_quantile(mm_handle *mq); +static inline idx_t mq_k_stat(mm_handle *mq, idx_t idx, short adjustment); +static inline short mq_stat_exact(mm_handle *mq, idx_t idx); static inline void heapify_small_node(mm_handle *mm, idx_t idx); static inline void heapify_large_node(mm_handle *mm, idx_t idx); static inline idx_t mm_get_smallest_child(mm_node **heap, idx_t window, @@ -69,16 +75,43 @@ mm_new(const idx_t window, idx_t min_count) { mm->window = window; mm->odd = window % 2; mm->min_count = min_count; + mm->get_statistic = mm_get_median; + mm->get_index = mm_get_nl; mm_reset(mm); return mm; } +/* Same as mm_new, but the heaps have different sizes */ +mm_handle * +mq_new(const idx_t window, idx_t min_count, double quantile) { + mm_handle *mq = malloc(sizeof(mm_handle)); + mq->nodes = malloc(window * sizeof(mm_node*)); + mq->node_data = malloc(window * sizeof(mm_node)); + + mq->quantile = quantile; + + mq->s_heap = mq->nodes; + idx_t k_stat = mq_k_stat(mq, window, 0) + 1; + mq->l_heap = &mq->nodes[k_stat]; + + mq->window = window; + mq->odd = window % 2; + mq->min_count = min_count; + mq->get_statistic = mq_get_quantile; + mq->get_index = mq_k_stat; + + mm_reset(mq); + + return mq; +} + /* Insert a new value, ai, into one of the heaps. Use this function when * the heaps contain less than window-1 nodes. Returns the median value. - * Once there are window-1 nodes in the heap, switch to using mm_update. */ + * Once there are window-1 nodes in the heap, switch to using mm_update. + * Used by both mm and mq */ ai_t mm_update_init(mm_handle *mm, ai_t ai) { mm_node *node; @@ -99,7 +132,8 @@ mm_update_init(mm_handle *mm, ai_t ai) { } else { /* at least one node already exists in the heaps */ mm->newest->next = node; - if (n_s > n_l) { + idx_t k_stat = mm->get_index(mm, n_s + n_l + 1, 0); + if (n_s > k_stat) { /* add new node to large heap */ mm->l_heap[n_l] = node; node->region = LH; @@ -120,14 +154,27 @@ mm_update_init(mm_handle *mm, ai_t ai) { mm->newest = node; - return mm_get_median(mm); + return mm->get_statistic(mm); +} + +/* For non-nan move_median, as soon as heap has window nodes, switch to + * more explicit median functions, to avoid redundant calculations*/ +void +mm_update_statistic_function(mm_handle *mm) { + if (mm->odd) { + mm->get_statistic = mm_get_median_no_nans_full_window_odd; + } else { + mm->get_statistic = mm_get_median_no_nans_full_window_even; + } + return; } /* Insert a new value, ai, into the double heap structure. Use this function * when the double heap contains at least window-1 nodes. Returns the median * value. If there are less than window-1 nodes in the heap, use - * mm_update_init. */ + * mm_update_init. + * Used by both mm and mq */ ai_t mm_update(mm_handle *mm, ai_t ai) { /* node is oldest node with ai of newest node */ @@ -147,11 +194,7 @@ mm_update(mm_handle *mm, ai_t ai) { } /* return the median */ - if (mm->odd) { - return mm->s_heap[0]->ai; - } else { - return (mm->s_heap[0]->ai + mm->l_heap[0]->ai) / 2.0; - } + return mm->get_statistic(mm); } @@ -177,17 +220,44 @@ mm_new_nan(const idx_t window, idx_t min_count) { mm->window = window; mm->min_count = min_count; + mm->get_statistic = mm_get_median; + mm->get_index = mm_get_nl; mm_reset(mm); return mm; } +/* Same as mm_new_nan, but the heaps have different sizes */ +mm_handle * +mq_new_nan(const idx_t window, idx_t min_count, double quantile) { + mm_handle *mq = malloc(sizeof(mm_handle)); + mq->nodes = malloc(2 * window * sizeof(mm_node*)); + mq->node_data = malloc(window * sizeof(mm_node)); + + mq->quantile = quantile; + + mq->s_heap = mq->nodes; + idx_t k_stat = mq_k_stat(mq, window, 0) + 1; + mq->l_heap = &mq->nodes[k_stat]; + mq->n_array = &mq->nodes[window]; + + mq->window = window; + mq->min_count = min_count; + mq->get_statistic = mq_get_quantile; + mq->get_index = mq_k_stat; + + mm_reset(mq); + + return mq; +} + /* Insert a new value, ai, into one of the heaps or the nan array. Use this * function when there are less than window-1 nodes. Returns the median * value. Once there are window-1 nodes in the heap, switch to using - * mm_update_nan. */ + * mm_update_nan. + * Used by both mm and mq*/ ai_t mm_update_init_nan(mm_handle *mm, ai_t ai) { mm_node *node; @@ -226,7 +296,9 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { } else { /* at least one node already exists in the heaps */ mm->newest->next = node; - if (n_s > n_l) { + /* number of non-nan nodes including the new node is n_s + n_l + 1 */ + idx_t k_stat = mm->get_index(mm, n_s + n_l + 1, 0); + if (n_s > k_stat) { /* add new node to large heap */ mm->l_heap[n_l] = node; node->region = LH; @@ -247,13 +319,14 @@ mm_update_init_nan(mm_handle *mm, ai_t ai) { } mm->newest = node; - return mm_get_median(mm); + return mm->get_statistic(mm); } /* Insert a new value, ai, into one of the heaps or the nan array. Use this * function when there are at least window-1 nodes. Returns the median value. - * If there are less than window-1 nodes, use mm_update_init_nan. */ + * If there are less than window-1 nodes, use mm_update_init_nan. + * Used by both mm and mq */ ai_t mm_update_nan(mm_handle *mm, ai_t ai) { idx_t n_s, n_l, n_n; @@ -324,7 +397,8 @@ mm_update_nan(mm_handle *mm, ai_t ai) { s_heap[idx]->idx = idx; heapify_small_node(mm, idx); } - if (mm->n_s < mm->n_l) { + idx_t k_stat = mm->get_index(mm, mm->n_s + mm->n_l, 1); + if (mm->n_s < k_stat) { /* move head node from the large heap to the small heap */ node2 = mm->l_heap[0]; node2->idx = mm->n_s; @@ -375,7 +449,8 @@ mm_update_nan(mm_handle *mm, ai_t ai) { } else { mm->l_first_leaf = FIRST_LEAF(mm->n_l); } - if (mm->n_l < mm->n_s - 1) { + idx_t k_stat = mm->get_index(mm, mm->n_s + mm->n_l, 0); + if (mm->n_s > k_stat + 1) { /* move head node from the small heap to the large heap */ node2 = mm->s_heap[0]; node2->idx = mm->n_l; @@ -412,7 +487,9 @@ mm_update_nan(mm_handle *mm, ai_t ai) { heapify_large_node(mm, idx); } else { /* ai is not NaN but oldest node is in nan array */ - if (n_s > n_l) { + // k_stat = mm_k_stat(mm, n_s + n_l + 1); + idx_t k_stat = mm->get_index(mm, n_s + n_l + 1, 0); + if (n_s > k_stat) { /* insert into large heap */ node->region = LH; node->idx = n_l; @@ -437,7 +514,7 @@ mm_update_nan(mm_handle *mm, ai_t ai) { --mm->n_n; } } - return mm_get_median(mm); + return mm->get_statistic(mm); } @@ -459,6 +536,12 @@ mm_reset(mm_handle *mm) { mm->l_first_leaf = 0; } +/* For non-nan move_median, need to reset the statistic function as well */ +void +mm_reset_statistic_function(mm_handle *mm) { + mm->get_statistic = mm_get_median; + return; +} /* After bn.move_median is done, free the memory */ void @@ -486,7 +569,49 @@ mm_get_median(mm_handle *mm) { return (mm->s_heap[0]->ai + mm->l_heap[0]->ai) / 2.0; } +static inline ai_t +mm_get_median_no_nans_full_window_odd(mm_handle *mm) { + return mm->s_heap[0]->ai; +} + +static inline ai_t +mm_get_median_no_nans_full_window_even(mm_handle *mm) { + return (mm->s_heap[0]->ai + mm->l_heap[0]->ai) / 2.0; +} + +/* Return the current quantile */ +static inline ai_t +mq_get_quantile(mm_handle *mq) { + idx_t n_total = mq->n_l + mq->n_s; + if (n_total < mq->min_count) + return MM_NAN(); + if (mq_stat_exact(mq, n_total)) + return mq->s_heap[0]->ai; + return (mq->s_heap[0]->ai + mq->l_heap[0]->ai) / 2.0; +} + +/* function to check if the current index of the quantile is integer. If so, + * then the quantile is the element at the top of the heap. Otherwise take a midpoint */ +static inline short mq_stat_exact(mm_handle *mq, idx_t idx) { + return ((idx - 1) * mq->quantile) == floor((idx - 1) * mq->quantile); +} + +/* helper function to get mm->n_l while having same functions for mm and mq + * This will be assigned to mm->get_index */ +static inline idx_t mm_get_nl(mm_handle *mm, idx_t idx, short adjustment) { + return mm->n_l; +} + +/* function to find the index of element correspodning to the current quantile + * Due some some off-by-one nuances, one place in mm_update_nan requires + * this to return index + 1 (to be consistent with mm). Hence use adjustment arg + * This will be assigned to mq->get_index */ +static inline idx_t mq_k_stat(mm_handle *mq, idx_t idx, short adjustment) { + return (idx_t) (floor((idx - 1) * mq->quantile) + adjustment); +} + +/*heapify_small/large_node is same for mm and mq */ static inline void heapify_small_node(mm_handle *mm, idx_t idx) { idx_t idx2; diff --git a/bottleneck/src/move_median/move_median.h b/bottleneck/src/move_median/move_median.h index ccc3176096..5ca39acd44 100644 --- a/bottleneck/src/move_median/move_median.h +++ b/bottleneck/src/move_median/move_median.h @@ -35,6 +35,8 @@ struct _mm_node { }; typedef struct _mm_node mm_node; +/* this struct is used by both move_median(mm) and move_quantile(mq) */ +typedef struct _mm_handle mm_handle; struct _mm_handle { idx_t window; /* window size */ int odd; /* is window even (0) or odd (1) */ @@ -51,20 +53,34 @@ struct _mm_handle { mm_node *newest; /* The newest node (most recent insert) */ idx_t s_first_leaf; /* All nodes this index or greater are leaf nodes */ idx_t l_first_leaf; /* All nodes this index or greater are leaf nodes */ + double quantile; /* Value between 0 and 1, the quantile to compute */ + + /* get_median for move_median, get_quantile for move_quantile */ + ai_t (*get_statistic)(mm_handle *); + /* returns n_l for median, returns index before quantile for quantile */ + idx_t (*get_index)(mm_handle *, idx_t, short); }; -typedef struct _mm_handle mm_handle; /* non-nan functions */ mm_handle *mm_new(const idx_t window, idx_t min_count); +mm_handle *mq_new(const idx_t window, idx_t min_count, double quantile); +/* used by both mm and mq */ ai_t mm_update_init(mm_handle *mm, ai_t ai); ai_t mm_update(mm_handle *mm, ai_t ai); +/* helper functions for non-nan move_median */ +void mm_update_statistic_function(mm_handle *mm); +void mm_reset_statistic_function(mm_handle *mm); + /* nan functions */ mm_handle *mm_new_nan(const idx_t window, idx_t min_count); +mm_handle *mq_new_nan(const idx_t window, idx_t min_count, double quantile); +/* used by both mm and mq */ ai_t mm_update_init_nan(mm_handle *mm, ai_t ai); ai_t mm_update_nan(mm_handle *mm, ai_t ai); /* functions common to non-nan and nan cases */ +/* used by both mm and mq */ void mm_reset(mm_handle *mm); void mm_free(mm_handle *mm); diff --git a/bottleneck/src/move_quantile.py b/bottleneck/src/move_quantile.py new file mode 100644 index 0000000000..1013c7fc4a --- /dev/null +++ b/bottleneck/src/move_quantile.py @@ -0,0 +1,13 @@ +from ..move import move_quantile as move_quantile_c +from collections.abc import Iterable +import numpy as np + +def move_quantile(a, window, q, min_count=None, axis=-1): + if not isinstance(q, Iterable): + return move_quantile_c(a, window=window, min_count=min_count, axis=axis, q=q) + result = np.asarray( + [move_quantile_c(a=a, window=window, min_count=min_count, axis=axis, q=quantile) for quantile in q] + ) + return result + +move_quantile.__doc__ = move_quantile_c.__doc__ \ No newline at end of file diff --git a/bottleneck/src/move_template.c b/bottleneck/src/move_template.c index 8a589703d1..bdd4fc0b95 100644 --- a/bottleneck/src/move_template.c +++ b/bottleneck/src/move_template.c @@ -30,23 +30,38 @@ int window, \ int min_count, \ int axis, \ - int ddof) + int ddof, \ + double quantile) /* top-level functions such as move_sum */ -#define MOVE_MAIN(name, ddof) \ +#define MOVE_MAIN(name, has_ddof, has_quantile) \ static PyObject * \ name(PyObject *self, PyObject *args, PyObject *kwds) \ { \ return mover(#name, \ - args, \ - kwds, \ - name##_float64, \ - name##_float32, \ - name##_int64, \ - name##_int32, \ - ddof); \ + args, \ + kwds, \ + name##_float64, \ + name##_float32, \ + name##_int64, \ + name##_int32, \ + has_ddof, \ + has_quantile); \ } +/* Mover function */ +#define MOVER(name, args, kwds, has_ddof, has_quantile) \ + return mover(#name, \ + args, \ + kwds, \ + name##_float64, \ + name##_float32, \ + name##_int64, \ + name##_int32, \ + has_ddof, \ + has_quantile); + + /* typedefs and prototypes ----------------------------------------------- */ /* used by move_min and move_max */ @@ -57,7 +72,7 @@ struct _pairs { typedef struct _pairs pairs; /* function pointer for functions passed to mover */ -typedef PyObject *(*move_t)(PyArrayObject *, int, int, int, int); +typedef PyObject *(*move_t)(PyArrayObject *, int, int, int, int, double); static PyObject * mover(char *name, @@ -67,7 +82,8 @@ mover(char *name, move_t, move_t, move_t, - int has_ddof); + int has_ddof, + int has_quantile); /* move_sum -------------------------------------------------------------- */ @@ -148,7 +164,7 @@ MOVE(move_sum, DTYPE0) { /* dtype end */ -MOVE_MAIN(move_sum, 0) +MOVE_MAIN(move_sum, 0, 0) /* move_mean -------------------------------------------------------------- */ @@ -234,7 +250,7 @@ MOVE(move_mean, DTYPE0) { /* dtype end */ -MOVE_MAIN(move_mean, 0) +MOVE_MAIN(move_mean, 0, 0) /* move_std, move_var ---------------------------------------------------- */ @@ -373,7 +389,7 @@ MOVE(NAME, DTYPE0) { } /* dtype end */ -MOVE_MAIN(NAME, 1) +MOVE_MAIN(NAME, 1, 0) /* repeat end */ @@ -535,7 +551,7 @@ MOVE(NAME, DTYPE0) { } /* dtype end */ -MOVE_MAIN(NAME, 0) +MOVE_MAIN(NAME, 0, 0) /* repeat end */ /* move_median ----------------------------------------------------------- */ @@ -598,11 +614,13 @@ MOVE(move_median, DTYPE0) { ai = AI(DTYPE0); YI(DTYPE1) = mm_update_init(mm, ai); } + mm_update_statistic_function(mm); WHILE2 { ai = AI(DTYPE0); YI(DTYPE1) = mm_update(mm, ai); } mm_reset(mm); + mm_reset_statistic_function(mm); NEXT2 } mm_free(mm); @@ -612,7 +630,83 @@ MOVE(move_median, DTYPE0) { /* dtype end */ -MOVE_MAIN(move_median, 0) +MOVE_MAIN(move_median, 0, 0) + +/* move_quantile ----------------------------------------------------------- */ + +/* dtype = [['float64'], ['float32']] */ +MOVE(move_quantile, DTYPE0) { + npy_DTYPE0 ai; + mm_handle *mq = mq_new_nan(window, min_count, quantile); + INIT(NPY_DTYPE0) + if (window == 1) { + mm_free(mq); + return PyArray_Copy(a); + } + if (mq == NULL) { + MEMORY_ERR("Could not allocate memory for move_quantile"); + } + BN_BEGIN_ALLOW_THREADS + WHILE { + WHILE0 { + ai = AI(DTYPE0); + YI(DTYPE0) = mm_update_init_nan(mq, ai); + } + WHILE1 { + ai = AI(DTYPE0); + YI(DTYPE0) = mm_update_init_nan(mq, ai); + } + WHILE2 { + ai = AI(DTYPE0); + YI(DTYPE0) = mm_update_nan(mq, ai); + } + mm_reset(mq); + NEXT2 + } + mm_free(mq); + BN_END_ALLOW_THREADS + return y; +} +/* dtype end */ + +/* dtype = [['int64', 'float64'], ['int32', 'float64']] */ +MOVE(move_quantile, DTYPE0) { + npy_DTYPE0 ai; + mm_handle *mq = mq_new(window, min_count, quantile); + INIT(NPY_DTYPE1) + if (window == 1) { + return PyArray_CastToType(a, + PyArray_DescrFromType(NPY_DTYPE1), + PyArray_CHKFLAGS(a, NPY_ARRAY_F_CONTIGUOUS)); + } + if (mq == NULL) { + MEMORY_ERR("Could not allocate memory for move_quantile"); + } + BN_BEGIN_ALLOW_THREADS + WHILE { + WHILE0 { + ai = AI(DTYPE0); + YI(DTYPE1) = mm_update_init(mq, ai); + } + WHILE1 { + ai = AI(DTYPE0); + YI(DTYPE1) = mm_update_init(mq, ai); + } + WHILE2 { + ai = AI(DTYPE0); + YI(DTYPE1) = mm_update(mq, ai); + } + mm_reset(mq); + NEXT2 + } + mm_free(mq); + BN_END_ALLOW_THREADS + return y; +} +/* dtype end */ + + +MOVE_MAIN(move_quantile, 0, 1) /* move_rank-------------------------------------------------------------- */ @@ -738,7 +832,7 @@ MOVE(move_rank, DTYPE0) { /* dtype end */ -MOVE_MAIN(move_rank, 0) +MOVE_MAIN(move_rank, 0, 0) /* python strings -------------------------------------------------------- */ @@ -748,6 +842,7 @@ PyObject *pystr_window = NULL; PyObject *pystr_min_count = NULL; PyObject *pystr_axis = NULL; PyObject *pystr_ddof = NULL; +PyObject *pystr_q = NULL; static int intern_strings(void) { @@ -756,116 +851,134 @@ intern_strings(void) { pystr_min_count = PyString_InternFromString("min_count"); pystr_axis = PyString_InternFromString("axis"); pystr_ddof = PyString_InternFromString("ddof"); + pystr_q = PyString_InternFromString("q"); return pystr_a && pystr_window && pystr_min_count && - pystr_axis && pystr_ddof; + pystr_axis && pystr_ddof && pystr_q; } /* mover ----------------------------------------------------------------- */ +/* helper function to set a keyword argument */ +static inline short +set_kw_argument(PyObject **var, + PyObject **key_ptr, + PyObject **value_ptr, + PyObject **string_ptr, + short *set_var, + short condition) { + if (PyObject_RichCompareBool(*key_ptr, *string_ptr, Py_EQ)) { + if (*set_var) { + TYPE_ERR("Repeated argument was passed!"); + return 0; + } else if (!condition) { + TYPE_ERR("Keyword argument not supported!"); + return 0; + } + *var = *value_ptr; + *set_var = 1; + return 1; + } + return 2; + +} + +#define CHECK_STATUS(status) \ + if (!status) { return 0; } else if (status == 1) { continue; } + +/* helper function to set a non-keyword argument */ +static inline short +set_argument(PyObject **var, + PyObject *args, + short *set_var, + short condition, + int nargs, + short *counter) { + if (*counter && condition) { + if (*set_var) { + TYPE_ERR("Repeated argument was passed!"); + return 0; + } + *var = PyTuple_GET_ITEM(args, nargs - *counter); + *counter -= 1; + *set_var = 1; + return 1; + } + return 1; +} + static inline int parse_args(PyObject *args, PyObject *kwds, int has_ddof, + int has_quantile, PyObject **a, PyObject **window, PyObject **min_count, PyObject **axis, - PyObject **ddof) { + PyObject **ddof, + PyObject **q) { const Py_ssize_t nargs = PyTuple_GET_SIZE(args); const Py_ssize_t nkwds = kwds == NULL ? 0 : PyDict_Size(kwds); + + if (nargs + nkwds > 4 + has_ddof + has_quantile) { + TYPE_ERR("Too many arguments"); + return 0; + } + + PyObject *key; + PyObject *value; + Py_ssize_t pos = 0; + + short set_a = 0, + set_window = 0, + set_min_count = 0, + set_axis = 0, + set_ddof = 0, + set_q = 0; + if (nkwds) { - int nkwds_found = 0; - PyObject *tmp; - switch (nargs) { - case 4: - if (has_ddof) { - *axis = PyTuple_GET_ITEM(args, 3); - } else { - TYPE_ERR("wrong number of arguments"); - return 0; - } - case 3: *min_count = PyTuple_GET_ITEM(args, 2); - case 2: *window = PyTuple_GET_ITEM(args, 1); - case 1: *a = PyTuple_GET_ITEM(args, 0); - case 0: break; - default: - TYPE_ERR("wrong number of arguments"); - return 0; - } - switch (nargs) { - case 0: - *a = PyDict_GetItem(kwds, pystr_a); - if (*a == NULL) { - TYPE_ERR("Cannot find `a` keyword input"); - return 0; - } - nkwds_found += 1; - case 1: - *window = PyDict_GetItem(kwds, pystr_window); - if (*window == NULL) { - TYPE_ERR("Cannot find `window` keyword input"); - return 0; - } - nkwds_found++; - case 2: - tmp = PyDict_GetItem(kwds, pystr_min_count); - if (tmp != NULL) { - *min_count = tmp; - nkwds_found++; - } - case 3: - tmp = PyDict_GetItem(kwds, pystr_axis); - if (tmp != NULL) { - *axis = tmp; - nkwds_found++; - } - case 4: - if (has_ddof) { - tmp = PyDict_GetItem(kwds, pystr_ddof); - if (tmp != NULL) { - *ddof = tmp; - nkwds_found++; - } - break; - } - break; - default: - TYPE_ERR("wrong number of arguments"); - return 0; - } - if (nkwds_found != nkwds) { - TYPE_ERR("wrong number of keyword arguments"); - return 0; - } - if (nargs + nkwds_found > 4 + has_ddof) { - TYPE_ERR("too many arguments"); + short status; + while (PyDict_Next(kwds, &pos, &key, &value)) { + status = set_kw_argument(a, &key, &value, &pystr_a, &set_a, 1); + CHECK_STATUS(status) + status = set_kw_argument(window, &key, &value, &pystr_window, &set_window, 1); + CHECK_STATUS(status) + status = set_kw_argument(min_count, &key, &value, &pystr_min_count, &set_min_count, 1); + CHECK_STATUS(status) + status = set_kw_argument(axis, &key, &value, &pystr_axis, &set_axis, 1); + CHECK_STATUS(status) + status = set_kw_argument(ddof, &key, &value, &pystr_ddof, &set_ddof, has_ddof); + CHECK_STATUS(status) + status = set_kw_argument(q, &key, &value, &pystr_q, &set_q, has_quantile); + CHECK_STATUS(status) + TYPE_ERR("Unsupported keyword argument"); return 0; } - } else { - switch (nargs) { - case 5: - if (has_ddof) { - *ddof = PyTuple_GET_ITEM(args, 4); - } else { - TYPE_ERR("wrong number of arguments"); - return 0; - } - case 4: - *axis = PyTuple_GET_ITEM(args, 3); - case 3: - *min_count = PyTuple_GET_ITEM(args, 2); - case 2: - *window = PyTuple_GET_ITEM(args, 1); - *a = PyTuple_GET_ITEM(args, 0); - break; - default: - TYPE_ERR("wrong number of arguments"); - return 0; - } } - return 1; + short args_not_found = nargs; + if (!set_argument(a, args, &set_a, 1, nargs, &args_not_found)) return 0; + if (!set_argument(window, args, &set_window, 1, nargs, &args_not_found)) return 0; + if (!set_argument(min_count, args, &set_min_count, 1, nargs, &args_not_found)) return 0; + if (!set_argument(axis, args, &set_axis, 1, nargs, &args_not_found)) return 0; + if (!set_argument(ddof, args, &set_ddof, has_ddof, nargs, &args_not_found)) return 0; + if (!set_argument(q, args, &set_q, has_quantile, nargs, &args_not_found)) return 0; + + if (args_not_found) { + TYPE_ERR("wrong number of arguments"); + return 0; + } + if (*a == NULL) { + TYPE_ERR("Cannot find `a` argument"); + return 0; + } + if (*window == NULL) { + TYPE_ERR("Cannot find `window` argument"); + return 0; + } + + return 1; } @@ -877,10 +990,12 @@ mover(char *name, move_t move_float32, move_t move_int64, move_t move_int32, - int has_ddof) { + int has_ddof, + int has_quantile) { int mc; int window; + double quantile; int axis; int ddof; int dtype; @@ -894,14 +1009,49 @@ mover(char *name, PyObject *a_obj = NULL; PyObject *window_obj = NULL; PyObject *min_count_obj = Py_None; + PyObject *quantile_obj = Py_None; PyObject *axis_obj = NULL; PyObject *ddof_obj = NULL; - if (!parse_args(args, kwds, has_ddof, &a_obj, &window_obj, - &min_count_obj, &axis_obj, &ddof_obj)) { + if (!parse_args(args, kwds, has_ddof, has_quantile, &a_obj, &window_obj, + &min_count_obj, &axis_obj, &ddof_obj, &quantile_obj)) { return NULL; } + /* quantile + * Checking quantile first because if q in {0, 0.5, 1} then + * another `mover` function is called. Check the quantile + * first to avoid converting (if needed) `a` to an array twice + */ + + quantile = (double) 0.5; + + if ((has_quantile) && (!strcmp(name, "move_quantile"))) { + if (quantile_obj != Py_None) { + quantile = PyFloat_AsDouble(quantile_obj); + if (error_converting(quantile)) { + TYPE_ERR("Value(s) in `q` must be float"); + return NULL; + } + if ((quantile < 0.0) || (quantile > 1.0)) { + /* Float/double specifiers %f and %lf don't work here for some reason*/ + PyErr_Format(PyExc_ValueError, + "Value(s) in `q` must be between 0. and 1."); + return NULL; + } + + if (quantile == 1.0) { + MOVER(move_max, args, kwds, has_ddof, 1) + } else if (quantile == 0.0) { + MOVER(move_min, args, kwds, has_ddof, 1) + } + } + + if (quantile == 0.5) { + MOVER(move_median, args, kwds, has_ddof, 1) + } + } + /* convert to array if necessary */ if (PyArray_Check(a_obj)) { a = (PyArrayObject *)a_obj; @@ -998,13 +1148,13 @@ mover(char *name, dtype = PyArray_TYPE(a); if (dtype == NPY_float64) { - y = move_float64(a, window, mc, axis, ddof); + y = move_float64(a, window, mc, axis, ddof, quantile); } else if (dtype == NPY_float32) { - y = move_float32(a, window, mc, axis, ddof); + y = move_float32(a, window, mc, axis, ddof, quantile); } else if (dtype == NPY_int64) { - y = move_int64(a, window, mc, axis, ddof); + y = move_int64(a, window, mc, axis, ddof, quantile); } else if (dtype == NPY_int32) { - y = move_int32(a, window, mc, axis, ddof); + y = move_int32(a, window, mc, axis, ddof, quantile); } else { y = slow(name, args, kwds); } @@ -1427,6 +1577,48 @@ array([ 1. , 1.5, 2.5, 3.5]) MULTILINE STRING END */ +static char move_quantile_doc[] = +/* MULTILINE STRING BEGIN +move_quantile(a, window, q, min_count=None, axis=-1) + +Moving window quantile along the specified axis, ignoring NaNs. + +float64 output is returned for all input data types. + +Interpolation method for the quantile is `midpoint`. + +Parameters +---------- +a : ndarray + Input array. If `a` is not an array, a conversion is attempted. +window : int + The number of elements in the moving window. +q : float or list of floats + Quantile(s) to compute, all values must be between 0 and 1 inclusive. +min_count: {int, None}, optional + If the number of non-NaN values in a window is less than `min_count`, + then a value of NaN is assigned to the window. By default `min_count` + is None, which is equivalent to setting `min_count` equal to `window`. +axis : int, optional + The axis over which the window is moved. By default the last axis + (axis=-1) is used. An axis of None is not allowed. + +Returns +------- +y : ndarray + The moving quantile of the input array along the specified axis. The + output has the same shape as the input. + +Examples +-------- +>>> a = np.array([1.0, np.nan, 3.0, 4.0, 5.0, 6.0, 7.0]) +>>> bn.move_quantile(a, window=4, q=0.3) +array([nan, nan, nan, nan, nan, 3.5, 4.5]) +>>> bn.move_quantile(a, window=4, q=0.3, min_count=3) +array([nan, nan, nan, 2. , 3.5, 3.5, 4.5]) + +MULTILINE STRING END */ + static char move_rank_doc[] = /* MULTILINE STRING BEGIN move_rank(a, window, min_count=None, axis=-1) @@ -1490,16 +1682,17 @@ MULTILINE STRING END */ static PyMethodDef move_methods[] = { - {"move_sum", (PyCFunction)move_sum, VARKEY, move_sum_doc}, - {"move_mean", (PyCFunction)move_mean, VARKEY, move_mean_doc}, - {"move_std", (PyCFunction)move_std, VARKEY, move_std_doc}, - {"move_var", (PyCFunction)move_var, VARKEY, move_var_doc}, - {"move_min", (PyCFunction)move_min, VARKEY, move_min_doc}, - {"move_max", (PyCFunction)move_max, VARKEY, move_max_doc}, - {"move_argmin", (PyCFunction)move_argmin, VARKEY, move_argmin_doc}, - {"move_argmax", (PyCFunction)move_argmax, VARKEY, move_argmax_doc}, - {"move_median", (PyCFunction)move_median, VARKEY, move_median_doc}, - {"move_rank", (PyCFunction)move_rank, VARKEY, move_rank_doc}, + {"move_sum", (PyCFunction)move_sum, VARKEY, move_sum_doc}, + {"move_mean", (PyCFunction)move_mean, VARKEY, move_mean_doc}, + {"move_std", (PyCFunction)move_std, VARKEY, move_std_doc}, + {"move_var", (PyCFunction)move_var, VARKEY, move_var_doc}, + {"move_min", (PyCFunction)move_min, VARKEY, move_min_doc}, + {"move_max", (PyCFunction)move_max, VARKEY, move_max_doc}, + {"move_argmin", (PyCFunction)move_argmin, VARKEY, move_argmin_doc}, + {"move_argmax", (PyCFunction)move_argmax, VARKEY, move_argmax_doc}, + {"move_median", (PyCFunction)move_median, VARKEY, move_median_doc}, + {"move_quantile", (PyCFunction)move_quantile, VARKEY, move_quantile_doc}, + {"move_rank", (PyCFunction)move_rank, VARKEY, move_rank_doc}, {NULL, NULL, 0, NULL} }; diff --git a/bottleneck/tests/input_modification_test.py b/bottleneck/tests/input_modification_test.py index 1546d81828..3ae33907e9 100644 --- a/bottleneck/tests/input_modification_test.py +++ b/bottleneck/tests/input_modification_test.py @@ -50,6 +50,10 @@ def test_modification(func): if "partition" in name: second_arg = 0 + kwargs = {} + if name == "move_quantile": + kwargs['q'] = 0.5 + for axis in axes: with np.errstate(invalid="ignore"): a1 = a.copy() @@ -57,7 +61,7 @@ def test_modification(func): if any(x in name for x in ["move", "sort", "partition"]): with warnings.catch_warnings(): warnings.simplefilter("ignore") - func(a1, second_arg, axis=axis) + func(a1, second_arg, axis=axis, **kwargs) else: try: with warnings.catch_warnings(): diff --git a/bottleneck/tests/list_input_test.py b/bottleneck/tests/list_input_test.py index 4306a23990..3fd570b299 100644 --- a/bottleneck/tests/list_input_test.py +++ b/bottleneck/tests/list_input_test.py @@ -34,6 +34,9 @@ def test_list_input(func): name = func.__name__ if name == "replace": return + kwargs = {} + if name == "move_quantile": + kwargs['q'] = 0.5 func0 = eval("bn.slow.%s" % name) for i, a in enumerate(lists()): with warnings.catch_warnings(): @@ -42,8 +45,8 @@ def test_list_input(func): actual = func(a) desired = func0(a) except TypeError: - actual = func(a, 2) - desired = func0(a, 2) + actual = func(a, 2, **kwargs) + desired = func0(a, 2, **kwargs) a = np.array(a) tup = (name, "a" + str(i), str(a.dtype), str(a.shape), a) err_msg = msg % tup diff --git a/bottleneck/tests/move_test.py b/bottleneck/tests/move_test.py index 26fe36f843..6bd8ac2e64 100644 --- a/bottleneck/tests/move_test.py +++ b/bottleneck/tests/move_test.py @@ -5,6 +5,8 @@ import bottleneck as bn from .util import arrays, array_order import pytest +import itertools +import warnings @pytest.mark.parametrize("func", bn.get_functions("move"), ids=lambda x: x.__name__) @@ -22,6 +24,9 @@ def test_move(func): decimal = 3 else: decimal = 5 + quantiles = [1.] + if func_name == "move_quantile": + quantiles = [0.33, 0.67] for i, a in enumerate(arrays(func_name)): axes = range(-1, a.ndim) for axis in axes: @@ -29,25 +34,29 @@ def test_move(func): for window in windows: min_counts = list(range(1, window + 1)) + [None] for min_count in min_counts: - actual = func(a, window, min_count, axis=axis) - desired = func0(a, window, min_count, axis=axis) - tup = ( - func_name, - window, - str(min_count), - "a" + str(i), - str(a.dtype), - str(a.shape), - str(axis), - array_order(a), - a, - ) - err_msg = fmt % tup - aaae(actual, desired, decimal, err_msg) - err_msg += "\n dtype mismatch %s %s" - da = actual.dtype - dd = desired.dtype - assert_equal(da, dd, err_msg % (da, dd)) + for q in quantiles: + kwargs = {} + if func_name == "move_quantile": + kwargs = {"q" : q} + actual = func(a, window=window, min_count=min_count, axis=axis, **kwargs) + desired = func0(a, window=window, min_count=min_count, axis=axis, **kwargs) + tup = ( + func_name, + window, + str(min_count), + "a" + str(i), + str(a.dtype), + str(a.shape), + str(axis), + array_order(a), + a, + ) + err_msg = fmt % tup + aaae(actual, desired, decimal, err_msg) + err_msg += "\n dtype mismatch %s %s" + da = actual.dtype + dd = desired.dtype + assert_equal(da, dd, err_msg % (da, dd)) # --------------------------------------------------------------------------- @@ -59,6 +68,11 @@ def test_arg_parsing(func, decimal=5): """test argument parsing.""" name = func.__name__ + + if name == "move_quantile": + from ..move import move_quantile as move_quantile_c + func = move_quantile_c + func0 = eval("bn.slow.%s" % name) a = np.array([1.0, 2, 3]) @@ -107,12 +121,34 @@ def test_arg_parsing(func, decimal=5): err_msg = fmt % "(a=a, axis=-1, min_count=None, window=2)" assert_array_almost_equal(actual, desired, decimal, err_msg) + actual = func(a=a, axis=-1, min_count=None, window=2) + desired = func0(a=a, axis=-1, min_count=None, window=2) + err_msg = fmt % "(a=a, axis=-1, min_count=None, window=2)" + assert_array_almost_equal(actual, desired, decimal, err_msg) + if name in ("move_std", "move_var"): actual = func(a, 2, 1, -1, ddof=1) desired = func0(a, 2, 1, -1, ddof=1) err_msg = fmt % "(a, 2, 1, -1, ddof=1)" assert_array_almost_equal(actual, desired, decimal, err_msg) + if name == "move_quantile": + q = 0.3 + actual = func(q=q, axis=-1, a=a, min_count=None, window=2) + desired = func0(q=q, axis=-1, a=a, min_count=None, window=2) + err_msg = fmt % "(q=q, axis=-1, a=a, min_count=None, window=2)" + assert_array_almost_equal(actual, desired, decimal, err_msg) + + actual = func(a, axis=-1, q=q, window=2, min_count=None) + desired = func0(a, axis=-1, q=q, window=2, min_count=None) + err_msg = fmt % "(a, axis=-1, q=q, window=2, min_count=None)" + assert_array_almost_equal(actual, desired, decimal, err_msg) + + actual = func(axis=-1, a=a, q=q, window=2) + desired = func0(axis=-1, a=a, q=q, window=2) + err_msg = fmt % "(axis=-1, a=a, q=q, window=2)" + assert_array_almost_equal(actual, desired, decimal, err_msg) + # regression test: make sure len(kwargs) == 0 doesn't raise args = (a, 1, 1, -1) kwargs = {} @@ -142,28 +178,29 @@ def test_arg_parse_raises(func): # increase size to 30. With those two changes the unit tests will take a # LONG time to run. +REPEAT_MEDIAN = 10 def test_move_median_with_nans(): """test move_median.c with nans""" fmt = "\nfunc %s | window %d | min_count %s\n\nInput array:\n%s\n" aaae = assert_array_almost_equal min_count = 1 - size = 10 func = bn.move_median func0 = bn.slow.move_median rs = np.random.RandomState([1, 2, 3]) - for i in range(100): - a = np.arange(size, dtype=np.float64) - idx = rs.rand(*a.shape) < 0.1 - a[idx] = np.inf - idx = rs.rand(*a.shape) < 0.2 - a[idx] = np.nan - rs.shuffle(a) - for window in range(2, size + 1): - actual = func(a, window=window, min_count=min_count) - desired = func0(a, window=window, min_count=min_count) - err_msg = fmt % (func.__name__, window, min_count, a) - aaae(actual, desired, decimal=5, err_msg=err_msg) + for size in [1, 2, 3, 4, 5, 9, 10, 19, 20, 21]: + for _ in range(REPEAT_MEDIAN): + a = np.arange(size, dtype=np.float64) + idx = rs.rand(*a.shape) < 0.1 + a[idx] = np.inf + idx = rs.rand(*a.shape) < 0.2 + a[idx] = np.nan + rs.shuffle(a) + for window in range(2, size + 1): + actual = func(a, window=window, min_count=min_count) + desired = func0(a, window=window, min_count=min_count) + err_msg = fmt % (func.__name__, window, min_count, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) def test_move_median_without_nans(): @@ -171,18 +208,186 @@ def test_move_median_without_nans(): fmt = "\nfunc %s | window %d | min_count %s\n\nInput array:\n%s\n" aaae = assert_array_almost_equal min_count = 1 - size = 10 func = bn.move_median func0 = bn.slow.move_median rs = np.random.RandomState([1, 2, 3]) - for i in range(100): - a = np.arange(size, dtype=np.int64) - rs.shuffle(a) - for window in range(2, size + 1): - actual = func(a, window=window, min_count=min_count) - desired = func0(a, window=window, min_count=min_count) - err_msg = fmt % (func.__name__, window, min_count, a) - aaae(actual, desired, decimal=5, err_msg=err_msg) + for size in [1, 2, 3, 4, 5, 9, 10, 19, 20, 21]: + for _ in range(REPEAT_MEDIAN): + a = np.arange(size, dtype=np.int64) + rs.shuffle(a) + for window in range(2, size + 1): + actual = func(a, window=window, min_count=min_count) + desired = func0(a, window=window, min_count=min_count) + err_msg = fmt % (func.__name__, window, min_count, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) + + +# --------------------------------------------------------------------------- +# move_quantile is newly added. So let's do (very) extensive testing +# +# Unfortunately, np.nanmedian(a) and np.nanquantile(a, q=0.5) don't always agree +# when a contains inf or -inf values. See for instance: +# https://github.com/numpy/numpy/issues/21932 +# https://github.com/numpy/numpy/issues/21091 +# +# Let's first test without inf and -inf. +# When there are no infs in data, bn.slow.move_quantile calls +# move_func for np_nanquantile_infs, which just runs +# np.nanquantile with interpolation="midpoint" + +REPEAT_QUANTILE = 10 + +def test_move_quantile_with_nans(): + """test move_quantile.c with nans""" + fmt = "\nfunc %s | window %d | min_count %s | q %f\n\nInput array:\n%s\n" + aaae = assert_array_almost_equal + min_count = 1 + size = 10 + func = bn.move_quantile + func0 = bn.slow.move_quantile + rs = np.random.RandomState([1, 2, 3]) + for size in [1, 2, 3, 5, 9, 10, 13, 16]: + for _ in range(REPEAT_QUANTILE): + for nan_frac in [0.2, 0.5, 0.7, 1.]: + # test more variants of arrays (ints, floats, diff values) + arrays = [np.arange(size, dtype=np.float64), + (rs.random(size) - 0.5) * rs.randint(0, 100_000), + (rs.random(size) - 0.5) / rs.randint(0, 100_000)] + for a in arrays: + # q = 0. and 1. are important edge cases. We call existing + # move_min and move_max for these, but still must check + # that it works properly + for q in [0., 1., rs.rand()]: + idx = rs.rand(*a.shape) < nan_frac + a[idx] = np.nan + rs.shuffle(a) + for window in range(2, size + 1): + actual = func(a, window=window, min_count=min_count, q=q) + desired = func0(a, window=window, min_count=min_count, q=q) + err_msg = fmt % (func.__name__, window, min_count, q, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) + +def test_move_quantile_without_nans(): + """test move_quantile.c without nans""" + fmt = "\nfunc %s | window %d | min_count %s | q %f\n\nInput array:\n%s\n" + aaae = assert_array_almost_equal + min_count = 1 + size = 10 + func = bn.move_quantile + func0 = bn.slow.move_quantile + rs = np.random.RandomState([1, 2, 3]) + for size in [1, 2, 3, 5, 9, 10, 13, 16]: + for _ in range(REPEAT_QUANTILE): + # test more variants of arrays (ints, floats, diff values) + arrays = [np.arange(size, dtype=np.float64), + (rs.random(size) - 0.5) * rs.randint(0, 100_000), + (rs.random(size) - 0.5) / rs.randint(0, 100_000)] + for a in arrays: + # q = 0. and 1. are important edge cases. We call existing + # move_min and move_max for these, but still must check + # that it works properly + for q in [0., 1., rs.rand()]: + rs.shuffle(a) + for window in range(2, size + 1): + actual = func(a, window=window, min_count=min_count, q=q) + desired = func0(a, window=window, min_count=min_count, q=q) + err_msg = fmt % (func.__name__, window, min_count, q, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) + + +# Now let's deal with inf ans -infs +# np.nanquantile doesn't give desired results when infs are present, +# due to abmiguities with arithmetic operations with infs. +# For instance, +# np.nanquantile([np.inf, np.inf], q=0.5, method="midpoint") returns np.nan, +# while +# np.nanmedian([np.inf, np.inf]) returns np.inf, +# although arguably these should give the same result. The behaviour of +# np.nanmedian is also agruably more expected. +# We check that the following will always give the same result as np.nanmedian(a): +# (np.nanquantile(a, q=0.5, method="lower") + np.nanquantile(a, q=0.5, method="higher")) / 2 +# It is also clear that this essentially is the same as method="midpoint". +# The next test verifies this. + +from ..slow.move import np_nanquantile_infs + +REPEAT_NUMPY_QUANTILE = 10 + +def test_numpy_nanquantile_infs(): + """test move_quantile.c with nans""" + fmt = "\nfunc %s \n\nInput array:\n%s\n" + aaae = assert_array_almost_equal + min_count = 1 + func = np.nanmedian + func0 = np_nanquantile_infs + rs = np.random.RandomState([1, 2, 3]) + sizes = [1, 2, 3, 4, 5, 9, 10, 20, 31] + fracs = [0., 0.2, 0.4, 0.6, 0.8, 1.] + inf_minf_nan_fracs = [triple for triple in itertools.product(fracs, fracs, fracs) if np.sum(triple) <= 1] + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + for size in sizes: + for _ in range(REPEAT_NUMPY_QUANTILE): + for (inf_frac, minus_inf_frac, nan_frac) in inf_minf_nan_fracs: + arrays = [np.arange(size, dtype=np.float64), + (rs.random(size) - 0.5) * rs.randint(0, 100_000), + (rs.random(size) - 0.5) / rs.randint(0, 100_000)] + for a in arrays: + randoms = rs.rand(*a.shape) + idx_nans = randoms < inf_frac + minus_inf_frac + nan_frac + a[idx_nans] = np.nan + idx_minfs = randoms < inf_frac + minus_inf_frac + a[idx_minfs] = -np.inf + idx_infs = randoms < inf_frac + a[idx_infs] = np.inf + rs.shuffle(a) + actual = func(a) + desired = func0(a, q=0.5) + err_msg = fmt % (func.__name__, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) + +# This shows that np_nanquantile_infs indeed replicates the +# behaviour of np.nanquantile, while correclty handling infs in data. +# So we use np_nanquantile_infs in our bn.slow.move_quantile for testing + +REPEAT_FULL_QUANTILE = 1 + +def test_move_quantile_with_infs_and_nans(): + """test move_quantile.c with infs and nans""" + fmt = "\nfunc %s | window %d | min_count %s | q %f\n\nInput array:\n%s\n" + aaae = assert_array_almost_equal + func = bn.move_quantile + func0 = bn.slow.move_quantile + rs = np.random.RandomState([1, 2, 3]) + fracs = [0., 0.2, 0.4, 0.6, 0.8, 1.] + inf_minf_nan_fracs = [triple for triple in itertools.product(fracs, fracs, fracs) if np.sum(triple) <= 1] + # for size in [1, 2, 3, 5, 9, 10, 17, 20, 31]: + for size in [1, 2, 3, 5, 9, 10]: + for min_count in [1, 2, 3, size//2, size - 1, size]: + if min_count < 1 or min_count > size: + continue + for (inf_frac, minus_inf_frac, nan_frac) in inf_minf_nan_fracs: + for window in range(min_count, size + 1): + for _ in range(REPEAT_FULL_QUANTILE): + for q in [0., 1., 0.25, 0.75, rs.rand()]: + arrays = [np.arange(size, dtype=np.float64), + (rs.random(size) - 0.5) * rs.randint(1, 100_000), + (rs.random(size) - 0.5) / rs.randint(1, 100_000)] + for a in arrays: + a = np.arange(size, dtype=np.float64) + randoms = rs.rand(*a.shape) + idx_nans = randoms < inf_frac + minus_inf_frac + nan_frac + a[idx_nans] = np.nan + idx_minfs = randoms < inf_frac + minus_inf_frac + a[idx_minfs] = -np.inf + idx_infs = randoms < inf_frac + a[idx_infs] = np.inf + rs.shuffle(a) + actual = func(a, window=window, min_count=min_count, q=q) + desired = func0(a, window=window, min_count=min_count, q=q) + err_msg = fmt % (func.__name__, window, min_count, q, a) + aaae(actual, desired, decimal=5, err_msg=err_msg) + # ---------------------------------------------------------------------------- diff --git a/bottleneck/tests/util.py b/bottleneck/tests/util.py index cd19d63042..c94faa2e28 100644 --- a/bottleneck/tests/util.py +++ b/bottleneck/tests/util.py @@ -48,6 +48,7 @@ def func_dict(): bn.move_argmin, bn.move_argmax, bn.move_median, + bn.move_quantile, bn.move_rank, ] d["nonreduce"] = [bn.replace] diff --git a/doc/source/reference.rst b/doc/source/reference.rst index a3ea589218..ecd7b17559 100644 --- a/doc/source/reference.rst +++ b/doc/source/reference.rst @@ -23,7 +23,8 @@ moving window :meth:`move_sum `, :meth :meth:`move_std `, :meth:`move_var `, :meth:`move_min `, :meth:`move_max `, :meth:`move_argmin `, :meth:`move_argmax `, - :meth:`move_median `, :meth:`move_rank ` + :meth:`move_median `, :meth:`move_quantile `, + :meth:`move_rank ` ================================= ============================================================================================== @@ -167,5 +168,9 @@ Functions that operate along a (1d) moving window. ------------ +.. autofunction:: bottleneck.move_quantile + +------------ + .. autofunction:: bottleneck.move_rank