From fced5b1475e9efd1cd2e26a7cf5795d9cc63cec5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 6 Nov 2019 10:11:03 -0800 Subject: [PATCH] CLN: assorted, mostly typing (#29419) --- pandas/_libs/algos.pyx | 71 ++++++++++++++++++++++++ pandas/_libs/algos_common_helper.pxi.in | 71 ------------------------ pandas/_libs/missing.pyx | 6 +-- pandas/_libs/window.pyx | 4 +- pandas/core/base.py | 8 +-- pandas/core/groupby/categorical.py | 4 +- pandas/core/groupby/generic.py | 72 +++++++++++++------------ pandas/core/groupby/groupby.py | 56 +++++++++++-------- pandas/core/groupby/ops.py | 46 +++++++++------- pandas/core/internals/concat.py | 14 ++--- pandas/core/reshape/concat.py | 27 +++++----- pandas/core/reshape/melt.py | 8 +-- pandas/core/reshape/merge.py | 26 +++++---- pandas/core/reshape/reshape.py | 2 +- pandas/core/window/common.py | 2 +- pandas/core/window/rolling.py | 19 ++++--- pandas/tseries/offsets.py | 8 +-- 17 files changed, 238 insertions(+), 206 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a08ae66865e20..2d6c8e1008ce1 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1150,6 +1150,77 @@ def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', return ranks +ctypedef fused diff_t: + float64_t + float32_t + int8_t + int16_t + int32_t + int64_t + +ctypedef fused out_t: + float32_t + float64_t + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d(ndarray[diff_t, ndim=2] arr, + ndarray[out_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy, start, stop + bint f_contig = arr.flags.f_contiguous + + # Disable for unsupported dtype combinations, + # see https://github.com/cython/cython/issues/2646 + if (out_t is float32_t + and not (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): + raise NotImplementedError + elif (out_t is float64_t + and (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): + raise NotImplementedError + else: + # We put this inside an indented else block to avoid cython build + # warnings about unreachable code + sx, sy = (arr).shape + with nogil: + if f_contig: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + # generated from template include "algos_common_helper.pxi" include "algos_take_helper.pxi" diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index ea05c4afc8fce..5bfc594602dd8 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -4,77 +4,6 @@ Template for each `dtype` helper function using 1-d template WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -ctypedef fused diff_t: - float64_t - float32_t - int8_t - int16_t - int32_t - int64_t - -ctypedef fused out_t: - 
float32_t - float64_t - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d(ndarray[diff_t, ndim=2] arr, - ndarray[out_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy, start, stop - bint f_contig = arr.flags.f_contiguous - - # Disable for unsupported dtype combinations, - # see https://github.com/cython/cython/issues/2646 - if (out_t is float32_t - and not (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): - raise NotImplementedError - elif (out_t is float64_t - and (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): - raise NotImplementedError - else: - # We put this inside an indented else block to avoid cython build - # warnings about unreachable code - sx, sy = (arr).shape - with nogil: - if f_contig: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - # ---------------------------------------------------------------------- # ensure_dtype # ---------------------------------------------------------------------- diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 052b081988c9e..9568ddb7fe53f 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -121,7 +121,7 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr): @cython.wraparound(False) @cython.boundscheck(False) -def isnaobj_old(ndarray arr): +def isnaobj_old(arr: ndarray) -> ndarray: """ Return boolean mask denoting which elements of a 1-D array are na-like, defined as being any of: @@ -156,7 +156,7 @@ def isnaobj_old(ndarray arr): @cython.wraparound(False) @cython.boundscheck(False) -def isnaobj2d(ndarray arr): +def isnaobj2d(arr: ndarray) -> ndarray: """ Return boolean mask denoting which elements of a 2-D array are na-like, according to the criteria defined in `checknull`: @@ -198,7 +198,7 @@ def isnaobj2d(ndarray arr): @cython.wraparound(False) @cython.boundscheck(False) -def isnaobj2d_old(ndarray arr): +def isnaobj2d_old(arr: ndarray) -> ndarray: """ Return boolean mask denoting which elements of a 2-D array are na-like, according to the criteria defined in `checknull_old`: diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index d1adc7789a7a3..b51d61d05ce98 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -69,8 +69,8 @@ def _check_minp(win, minp, N, floor=None) -> int: if not util.is_integer_object(minp): raise ValueError("min_periods must be an integer") if minp > win: - raise ValueError("min_periods (%d) must be <= " - "window (%d)" % (minp, win)) + raise ValueError("min_periods (minp) must be <= " + "window (win)".format(minp=minp, win=win)) elif minp > N: minp = N + 1 elif minp < 0: diff --git a/pandas/core/base.py b/pandas/core/base.py index 1a2f906f97152..0e088a381e964 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -207,7 +207,7 
@@ def _selected_obj(self): return self.obj[self._selection] @cache_readonly - def ndim(self): + def ndim(self) -> int: return self._selected_obj.ndim @cache_readonly @@ -339,7 +339,7 @@ def _aggregate(self, arg, *args, **kwargs): obj = self._selected_obj - def nested_renaming_depr(level=4): + def nested_renaming_depr(level: int = 4): # deprecation of nested renaming # GH 15931 msg = textwrap.dedent( @@ -488,11 +488,11 @@ def _agg(arg, func): # combine results - def is_any_series(): + def is_any_series() -> bool: # return a boolean if we have *any* nested series return any(isinstance(r, ABCSeries) for r in result.values()) - def is_any_frame(): + def is_any_frame() -> bool: # return a boolean if we have *any* nested series return any(isinstance(r, ABCDataFrame) for r in result.values()) diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index fcf52ecfcbbcd..399ed9ddc9ba1 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -8,7 +8,7 @@ ) -def recode_for_groupby(c, sort, observed): +def recode_for_groupby(c: Categorical, sort: bool, observed: bool): """ Code the categories to ensure we can groupby for categoricals. @@ -74,7 +74,7 @@ def recode_for_groupby(c, sort, observed): return c.reorder_categories(cat.categories), None -def recode_from_groupby(c, sort, ci): +def recode_from_groupby(c: Categorical, sort: bool, ci): """ Reverse the codes_to_groupby to account for sort / observed. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1e38dde2096ba..8512b6c3ae530 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -21,6 +21,7 @@ Tuple, Type, Union, + cast, ) import warnings @@ -369,7 +370,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # GH #6265 return Series([], name=self._selection_name, index=keys) - def _get_index(): + def _get_index() -> Index: if self.grouper.nkeys > 1: index = MultiIndex.from_tuples(keys, names=self.grouper.names) else: @@ -462,7 +463,7 @@ def transform(self, func, *args, **kwargs): result.index = self._selected_obj.index return result - def _transform_fast(self, func, func_nm): + def _transform_fast(self, func, func_nm) -> Series: """ fast version of transform, only applicable to builtin/cythonizable functions @@ -512,7 +513,7 @@ def filter(self, func, dropna=True, *args, **kwargs): wrapper = lambda x: func(x, *args, **kwargs) # Interpret np.nan as False. - def true_and_notna(x, *args, **kwargs): + def true_and_notna(x, *args, **kwargs) -> bool: b = wrapper(x, *args, **kwargs) return b and notna(b) @@ -526,7 +527,7 @@ def true_and_notna(x, *args, **kwargs): filtered = self._apply_filter(indices, dropna) return filtered - def nunique(self, dropna=True): + def nunique(self, dropna: bool = True) -> Series: """ Return number of unique elements in the group. @@ -719,7 +720,7 @@ def value_counts( out = ensure_int64(out) return Series(out, index=mi, name=self._selection_name) - def count(self): + def count(self) -> Series: """ Compute count of group, excluding missing values. 
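The `true_and_notna` helper annotated above is what lets `SeriesGroupBy.filter` treat a
predicate result of NaN as False instead of letting it propagate. A minimal usage sketch
(illustrative only, not part of the patch; the toy Series is made up):

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, 2.0, np.nan, 4.0], index=["a", "a", "b", "b"])
    # groups whose predicate evaluates to False (or NaN) are dropped
    out = s.groupby(level=0).filter(lambda x: x.mean() > 1.5)
    # group "a" has mean 1.5 and is dropped; only the "b" rows survive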
@@ -768,8 +769,6 @@ class DataFrameGroupBy(GroupBy): _apply_whitelist = base.dataframe_apply_whitelist - _block_agg_axis = 1 - _agg_see_also_doc = dedent( """ See Also @@ -944,19 +943,21 @@ def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: yield label, values - def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): + def _cython_agg_general( + self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + ): new_items, new_blocks = self._cython_agg_blocks( how, alt=alt, numeric_only=numeric_only, min_count=min_count ) return self._wrap_agged_blocks(new_items, new_blocks) - _block_agg_axis = 0 - - def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): + def _cython_agg_blocks( + self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + ): # TODO: the actual managing of mgr_locs is a PITA # here, it should happen via BlockManager.combine - data, agg_axis = self._get_data_to_aggregate() + data = self._get_data_to_aggregate() if numeric_only: data = data.get_numeric_data(copy=False) @@ -971,7 +972,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): locs = block.mgr_locs.as_array try: result, _ = self.grouper.aggregate( - block.values, how, axis=agg_axis, min_count=min_count + block.values, how, axis=1, min_count=min_count ) except NotImplementedError: # generally if we have numeric_only=False @@ -1000,12 +1001,13 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): # continue and exclude the block deleted_items.append(locs) continue - - # unwrap DataFrame to get array - assert len(result._data.blocks) == 1 - result = result._data.blocks[0].values - if result.ndim == 1 and isinstance(result, np.ndarray): - result = result.reshape(1, -1) + else: + result = cast(DataFrame, result) + # unwrap DataFrame to get array + assert len(result._data.blocks) == 1 + result = result._data.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) finally: assert not isinstance(result, DataFrame) @@ -1081,11 +1083,11 @@ def _aggregate_frame(self, func, *args, **kwargs): return self._wrap_frame_output(result, obj) - def _aggregate_item_by_item(self, func, *args, **kwargs): + def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: # only for axis==0 obj = self._obj_with_exclusions - result = OrderedDict() + result = OrderedDict() # type: dict cannot_agg = [] errors = None for item in obj: @@ -1291,12 +1293,12 @@ def first_not_none(values): # values are not series or array-like but scalars else: # only coerce dates if we find at least 1 datetime - coerce = any(isinstance(x, Timestamp) for x in values) + should_coerce = any(isinstance(x, Timestamp) for x in values) # self._selection_name not passed through to Series as the # result should not take the name of original selection # of columns return Series(values, index=key_index)._convert( - datetime=True, coerce=coerce + datetime=True, coerce=should_coerce ) else: @@ -1391,7 +1393,7 @@ def transform(self, func, *args, **kwargs): return self._transform_fast(result, obj, func) - def _transform_fast(self, result, obj, func_nm): + def _transform_fast(self, result: DataFrame, obj: DataFrame, func_nm) -> DataFrame: """ Fast transform path for aggregations """ @@ -1451,7 +1453,7 @@ def _choose_path(self, fast_path, slow_path, group): return path, res - def _transform_item_by_item(self, obj, wrapper): + def _transform_item_by_item(self, obj: DataFrame, 
wrapper) -> DataFrame: # iterate through columns output = {} inds = [] @@ -1536,7 +1538,7 @@ def filter(self, func, dropna=True, *args, **kwargs): return self._apply_filter(indices, dropna) - def _gotitem(self, key, ndim, subset=None): + def _gotitem(self, key, ndim: int, subset=None): """ sub-classes to define return a sliced object @@ -1571,7 +1573,7 @@ def _gotitem(self, key, ndim, subset=None): raise AssertionError("invalid ndim for _gotitem") - def _wrap_frame_output(self, result, obj): + def _wrap_frame_output(self, result, obj) -> DataFrame: result_index = self.grouper.levels[0] if self.axis == 0: @@ -1582,9 +1584,9 @@ def _wrap_frame_output(self, result, obj): def _get_data_to_aggregate(self): obj = self._obj_with_exclusions if self.axis == 1: - return obj.T._data, 1 + return obj.T._data else: - return obj._data, 1 + return obj._data def _insert_inaxis_grouper_inplace(self, result): # zip in reverse so we can always insert at loc 0 @@ -1622,7 +1624,7 @@ def _wrap_aggregated_output(self, output, names=None): return self._reindex_output(result)._convert(datetime=True) - def _wrap_transformed_output(self, output, names=None): + def _wrap_transformed_output(self, output, names=None) -> DataFrame: return DataFrame(output, index=self.obj.index) def _wrap_agged_blocks(self, items, blocks): @@ -1670,7 +1672,7 @@ def count(self): DataFrame Count of values within each group. """ - data, _ = self._get_data_to_aggregate() + data = self._get_data_to_aggregate() ids, _, ngroups = self.grouper.group_info mask = ids != -1 @@ -1687,7 +1689,7 @@ def count(self): return self._wrap_agged_blocks(data.items, list(blk)) - def nunique(self, dropna=True): + def nunique(self, dropna: bool = True): """ Return DataFrame with number of distinct observations per group for each column. @@ -1756,7 +1758,7 @@ def groupby_series(obj, col=None): boxplot = boxplot_frame_groupby -def _is_multi_agg_with_relabel(**kwargs): +def _is_multi_agg_with_relabel(**kwargs) -> bool: """ Check whether kwargs passed to .agg look like multi-agg with relabeling. @@ -1778,7 +1780,9 @@ def _is_multi_agg_with_relabel(**kwargs): >>> _is_multi_agg_with_relabel() False """ - return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and kwargs + return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and ( + len(kwargs) > 0 + ) def _normalize_keyword_aggregation(kwargs): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 59b118431cfc9..873a31e658625 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -756,7 +756,7 @@ def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: def transform(self, func, *args, **kwargs): raise AbstractMethodError(self) - def _cumcount_array(self, ascending=True): + def _cumcount_array(self, ascending: bool = True): """ Parameters ---------- @@ -788,7 +788,7 @@ def _cumcount_array(self, ascending=True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) - def _try_cast(self, result, obj, numeric_only=False): + def _try_cast(self, result, obj, numeric_only: bool = False): """ Try to cast the result to our obj original type, we may have roundtripped through object in the mean-time. 
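The `_is_multi_agg_with_relabel` hunk above is a truthiness fix as much as a typing one:
`all(...) and kwargs` evaluates to the `kwargs` dict itself when no keywords are passed,
so the helper could return `{}` where the new `-> bool` annotation promises a bool. A
standalone restatement of the new behaviour (outside pandas, for illustration):

    def is_multi_agg_with_relabel(**kwargs) -> bool:
        return all(
            isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()
        ) and len(kwargs) > 0

    assert is_multi_agg_with_relabel(height=("height", "max")) is True
    assert is_multi_agg_with_relabel() is False   # the old form returned {} here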
@@ -828,7 +828,7 @@ def _try_cast(self, result, obj, numeric_only=False): return result - def _transform_should_cast(self, func_nm): + def _transform_should_cast(self, func_nm: str) -> bool: """ Parameters ---------- @@ -844,8 +844,8 @@ def _transform_should_cast(self, func_nm): func_nm not in base.cython_cast_blacklist ) - def _cython_transform(self, how, numeric_only=True, **kwargs): - output = collections.OrderedDict() + def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): + output = collections.OrderedDict() # type: dict for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: @@ -871,10 +871,12 @@ def _wrap_aggregated_output(self, output, names=None): def _wrap_transformed_output(self, output, names=None): raise AbstractMethodError(self) - def _wrap_applied_output(self, keys, values, not_indexed_same=False): + def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) - def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): + def _cython_agg_general( + self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + ): output = {} for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) @@ -920,7 +922,7 @@ def _python_agg_general(self, func, *args, **kwargs): return self._wrap_aggregated_output(output) - def _concat_objects(self, keys, values, not_indexed_same=False): + def _concat_objects(self, keys, values, not_indexed_same: bool = False): from pandas.core.reshape.concat import concat def reset_identity(values): @@ -980,10 +982,7 @@ def reset_identity(values): values = reset_identity(values) result = concat(values, axis=self.axis) - if ( - isinstance(result, Series) - and getattr(self, "_selection_name", None) is not None - ): + if isinstance(result, Series) and self._selection_name is not None: result.name = self._selection_name @@ -1104,7 +1103,7 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: @Substitution(name="groupby") @Appender(_common_see_also) - def any(self, skipna=True): + def any(self, skipna: bool = True): """ Return True if any value in the group is truthful, else False. @@ -1121,7 +1120,7 @@ def any(self, skipna=True): @Substitution(name="groupby") @Appender(_common_see_also) - def all(self, skipna=True): + def all(self, skipna: bool = True): """ Return True if all values in the group are truthful, else False. @@ -1221,7 +1220,7 @@ def median(self, **kwargs): @Substitution(name="groupby") @Appender(_common_see_also) - def std(self, ddof=1, *args, **kwargs): + def std(self, ddof: int = 1, *args, **kwargs): """ Compute standard deviation of groups, excluding missing values. @@ -1244,7 +1243,7 @@ def std(self, ddof=1, *args, **kwargs): @Substitution(name="groupby") @Appender(_common_see_also) - def var(self, ddof=1, *args, **kwargs): + def var(self, ddof: int = 1, *args, **kwargs): """ Compute variance of groups, excluding missing values. @@ -1272,7 +1271,7 @@ def var(self, ddof=1, *args, **kwargs): @Substitution(name="groupby") @Appender(_common_see_also) - def sem(self, ddof=1): + def sem(self, ddof: int = 1): """ Compute standard error of the mean of groups, excluding missing values. @@ -1313,7 +1312,13 @@ def _add_numeric_operations(cls): Add numeric operations to the GroupBy generically. 
""" - def groupby_function(name, alias, npfunc, numeric_only=True, min_count=-1): + def groupby_function( + name: str, + alias: str, + npfunc, + numeric_only: bool = True, + min_count: int = -1, + ): _local_template = """ Compute %(f)s of group values. @@ -1403,7 +1408,7 @@ def last(x): @Substitution(name="groupby") @Appender(_common_see_also) - def ohlc(self): + def ohlc(self) -> DataFrame: """ Compute sum of values, excluding missing values. @@ -1815,7 +1820,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra return result - def quantile(self, q=0.5, interpolation="linear"): + def quantile(self, q=0.5, interpolation: str = "linear"): """ Return group values at the given quantile, a la numpy.percentile. @@ -1928,7 +1933,7 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: return result.take(indices) @Substitution(name="groupby") - def ngroup(self, ascending=True): + def ngroup(self, ascending: bool = True): """ Number each group from 0 to the number of groups - 1. @@ -1997,7 +2002,7 @@ def ngroup(self, ascending=True): return result @Substitution(name="groupby") - def cumcount(self, ascending=True): + def cumcount(self, ascending: bool = True): """ Number each item in each group from 0 to the length of that group - 1. @@ -2058,7 +2063,12 @@ def cumcount(self, ascending=True): @Substitution(name="groupby") @Appender(_common_see_also) def rank( - self, method="average", ascending=True, na_option="keep", pct=False, axis=0 + self, + method: str = "average", + ascending: bool = True, + na_option: str = "keep", + pct: bool = False, + axis: int = 0, ): """ Provide the rank of values within each group. diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 9bbe73c1851b5..2cc0e5fde2290 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -7,7 +7,7 @@ """ import collections -from typing import List, Optional +from typing import List, Optional, Type import numpy as np @@ -96,7 +96,7 @@ def __iter__(self): return iter(self.indices) @property - def nkeys(self): + def nkeys(self) -> int: return len(self.groupings) def get_iterator(self, data, axis=0): @@ -135,7 +135,7 @@ def _get_group_keys(self): # provide "flattened" iterator for multi-group setting return get_flattened_iterator(comp_ids, ngroups, self.levels, self.labels) - def apply(self, f, data, axis=0): + def apply(self, f, data, axis: int = 0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() @@ -220,7 +220,7 @@ def levels(self): def names(self): return [ping.name for ping in self.groupings] - def size(self): + def size(self) -> Series: """ Compute group sizes @@ -244,7 +244,7 @@ def groups(self): return self.axis.groupby(to_groupby) @cache_readonly - def is_monotonic(self): + def is_monotonic(self) -> bool: # return if my group orderings are monotonic return Index(self.group_info[0]).is_monotonic @@ -275,7 +275,7 @@ def _get_compressed_labels(self): return ping.labels, np.arange(len(ping.group_index)) @cache_readonly - def ngroups(self): + def ngroups(self) -> int: return len(self.result_index) @property @@ -345,7 +345,7 @@ def _is_builtin_func(self, arg): """ return SelectionMixin._builtin_table.get(arg, arg) - def _get_cython_function(self, kind, how, values, is_numeric): + def _get_cython_function(self, kind: str, how: str, values, is_numeric: bool): dtype_str = values.dtype.name @@ -386,7 +386,9 @@ def get_func(fname): return func - def _cython_operation(self, kind: str, 
values, how, axis, min_count=-1, **kwargs): + def _cython_operation( + self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs + ): assert kind in ["transform", "aggregate"] orig_values = values @@ -530,16 +532,23 @@ def _cython_operation(self, kind: str, values, how, axis, min_count=-1, **kwargs return result, names - def aggregate(self, values, how, axis=0, min_count=-1): + def aggregate(self, values, how: str, axis: int = 0, min_count: int = -1): return self._cython_operation( "aggregate", values, how, axis, min_count=min_count ) - def transform(self, values, how, axis=0, **kwargs): + def transform(self, values, how: str, axis: int = 0, **kwargs): return self._cython_operation("transform", values, how, axis, **kwargs) def _aggregate( - self, result, counts, values, comp_ids, agg_func, is_datetimelike, min_count=-1 + self, + result, + counts, + values, + comp_ids, + agg_func, + is_datetimelike: bool, + min_count: int = -1, ): if values.ndim > 2: # punting for now @@ -554,7 +563,7 @@ def _aggregate( return result def _transform( - self, result, values, comp_ids, transform_func, is_datetimelike, **kwargs + self, result, values, comp_ids, transform_func, is_datetimelike: bool, **kwargs ): comp_ids, _, ngroups = self.group_info @@ -566,7 +575,7 @@ def _transform( return result - def agg_series(self, obj, func): + def agg_series(self, obj: Series, func): if is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M": # _aggregate_series_fast would raise TypeError when # calling libreduction.Slider @@ -684,7 +693,7 @@ def groups(self): return result @property - def nkeys(self): + def nkeys(self) -> int: return 1 def _get_grouper(self): @@ -771,7 +780,7 @@ def groupings(self): for lvl, name in zip(self.levels, self.names) ] - def agg_series(self, obj, func): + def agg_series(self, obj: Series, func): dummy = obj[:0] grouper = libreduction.SeriesBinGrouper(obj, func, self.bins, dummy) return grouper.get_result() @@ -863,10 +872,11 @@ def _chop(self, sdata, slice_obj: slice): return sdata._slice(slice_obj, axis=1) -def get_splitter(data, *args, **kwargs): +def get_splitter(data: NDFrame, *args, **kwargs): if isinstance(data, Series): - klass = SeriesSplitter - elif isinstance(data, DataFrame): + klass = SeriesSplitter # type: Type[DataSplitter] + else: + # i.e. DataFrame klass = FrameSplitter return klass(data, *args, **kwargs) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 36e1b06230d7e..4ba485c85d8ba 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -244,7 +244,7 @@ def concatenate_join_units(join_units, concat_axis, copy): # Concatenating join units along ax0 is handled in _merge_blocks. raise AssertionError("Concatenating join units along axis0") - empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units) + empty_dtype, upcasted_na = _get_empty_dtype_and_na(join_units) to_concat = [ ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) @@ -268,7 +268,7 @@ def concatenate_join_units(join_units, concat_axis, copy): return concat_values -def get_empty_dtype_and_na(join_units): +def _get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. 
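Several hunks above annotate locals with mypy type comments (`# type: dict`,
`# type: Type[DataSplitter]`) so that both branches of a dispatch unify under one type.
A self-contained sketch of the `get_splitter` pattern, using stand-in classes and a
hypothetical DataFrame check (the real function dispatches on `isinstance(data, Series)`):

    from typing import Type

    class DataSplitter:
        def __init__(self, data, *args, **kwargs):
            self.data = data

    class SeriesSplitter(DataSplitter): ...
    class FrameSplitter(DataSplitter): ...

    def get_splitter(data, *args, **kwargs) -> DataSplitter:
        if hasattr(data, "columns"):
            # DataFrame-like input
            klass = FrameSplitter  # type: Type[DataSplitter]
        else:
            # Series-like input
            klass = SeriesSplitter
        return klass(data, *args, **kwargs)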
@@ -284,7 +284,7 @@ def get_empty_dtype_and_na(join_units): if blk is None: return np.float64, np.nan - if is_uniform_reindex(join_units): + if _is_uniform_reindex(join_units): # FIXME: integrate property empty_dtype = join_units[0].block.dtype upcasted_na = join_units[0].block.fill_value @@ -398,7 +398,7 @@ def is_uniform_join_units(join_units): ) -def is_uniform_reindex(join_units): +def _is_uniform_reindex(join_units) -> bool: return ( # TODO: should this be ju.block._can_hold_na? all(ju.block and ju.block.is_extension for ju in join_units) @@ -406,7 +406,7 @@ def is_uniform_reindex(join_units): ) -def trim_join_unit(join_unit, length): +def _trim_join_unit(join_unit, length): """ Reduce join_unit's shape along item axis to length. @@ -486,9 +486,9 @@ def _next_or_none(seq): for i, (plc, unit) in enumerate(next_items): yielded_units[i] = unit if len(plc) > min_len: - # trim_join_unit updates unit in place, so only + # _trim_join_unit updates unit in place, so only # placement needs to be sliced to skip min_len. - next_items[i] = (plc[min_len:], trim_join_unit(unit, min_len)) + next_items[i] = (plc[min_len:], _trim_join_unit(unit, min_len)) else: yielded_placement = plc next_items[i] = _next_or_none(plans[i]) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 39e00047ea968..772ac1cd93059 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -29,15 +29,15 @@ def concat( objs, axis=0, - join="outer", + join: str = "outer", join_axes=None, - ignore_index=False, + ignore_index: bool = False, keys=None, levels=None, names=None, - verify_integrity=False, + verify_integrity: bool = False, sort=None, - copy=True, + copy: bool = True, ): """ Concatenate pandas objects along a particular axis with optional set logic @@ -265,14 +265,14 @@ def __init__( self, objs, axis=0, - join="outer", + join: str = "outer", join_axes=None, keys=None, levels=None, names=None, - ignore_index=False, - verify_integrity=False, - copy=True, + ignore_index: bool = False, + verify_integrity: bool = False, + copy: bool = True, sort=False, ): if isinstance(objs, (NDFrame, str)): @@ -324,8 +324,8 @@ def __init__( for obj in objs: if not isinstance(obj, (Series, DataFrame)): msg = ( - "cannot concatenate object of type '{}';" - " only Series and DataFrame objs are valid".format(type(obj)) + "cannot concatenate object of type '{typ}';" + " only Series and DataFrame objs are valid".format(typ=type(obj)) ) raise TypeError(msg) @@ -580,7 +580,7 @@ def _get_concat_axis(self): return concat_axis - def _maybe_check_integrity(self, concat_index): + def _maybe_check_integrity(self, concat_index: Index): if self.verify_integrity: if not concat_index.is_unique: overlap = concat_index[concat_index.duplicated()].unique() @@ -590,11 +590,11 @@ def _maybe_check_integrity(self, concat_index): ) -def _concat_indexes(indexes): +def _concat_indexes(indexes) -> Index: return indexes[0].append(indexes[1:]) -def _make_concat_multiindex(indexes, keys, levels=None, names=None): +def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex: if (levels is None and isinstance(keys[0], tuple)) or ( levels is not None and len(levels) > 1 @@ -715,7 +715,6 @@ def _get_series_result_type(result, objs=None): """ # TODO: See if we can just inline with _constructor_expanddim # now that sparse is removed. 
- from pandas import DataFrame # concat Series with axis 1 if isinstance(result, dict): diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index c85050bc4232b..98fee491e0a73 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -188,7 +188,7 @@ def lreshape(data, groups, dropna=True, label=None): return data._constructor(mdata, columns=id_cols + pivot_cols) -def wide_to_long(df, stubnames, i, j, sep="", suffix=r"\d+"): +def wide_to_long(df, stubnames, i, j, sep: str = "", suffix: str = r"\d+"): r""" Wide panel to long format. Less flexible but more user-friendly than melt. @@ -419,7 +419,7 @@ def get_var_names(df, stub, sep, suffix): pattern = re.compile(regex) return [col for col in df.columns if pattern.match(col)] - def melt_stub(df, stub, i, j, value_vars, sep): + def melt_stub(df, stub, i, j, value_vars, sep: str): newdf = melt( df, id_vars=i, @@ -456,8 +456,8 @@ def melt_stub(df, stub, i, j, value_vars, sep): value_vars_flattened = [e for sublist in value_vars for e in sublist] id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) - melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)] - melted = melted[0].join(melted[1:], how="outer") + _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)] + melted = _melted[0].join(_melted[1:], how="outer") if len(i) == 1: new = df[id_vars].set_index(i).join(melted) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6ef13a62ee366..a189b2cd1ab84 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -10,7 +10,7 @@ import numpy as np -from pandas._libs import hashtable as libhashtable, lib +from pandas._libs import Timedelta, hashtable as libhashtable, lib import pandas._libs.join as libjoin from pandas.errors import MergeError from pandas.util._decorators import Appender, Substitution @@ -36,9 +36,10 @@ is_object_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.missing import isnull, na_value_for_dtype +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.missing import isna, na_value_for_dtype -from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta +from pandas import Categorical, Index, MultiIndex import pandas.core.algorithms as algos from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com @@ -1204,7 +1205,7 @@ def _validate_specification(self): if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") - def _validate(self, validate): + def _validate(self, validate: str): # Check uniqueness of each if self.left_index: @@ -1300,7 +1301,12 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs) def _restore_dropped_levels_multijoin( - left, right, dropped_level_names, join_index, lindexer, rindexer + left: MultiIndex, + right: MultiIndex, + dropped_level_names, + join_index, + lindexer, + rindexer, ): """ *this is an internal non-public method* @@ -1338,7 +1344,7 @@ def _restore_dropped_levels_multijoin( """ - def _convert_to_mulitindex(index): + def _convert_to_mulitindex(index) -> MultiIndex: if isinstance(index, MultiIndex): return index else: @@ -1686,13 +1692,13 @@ def flip(xs): msg_missings = "Merge keys contain null values on {side} side" if not Index(left_values).is_monotonic: - if isnull(left_values).any(): + if isna(left_values).any(): raise ValueError(msg_missings.format(side="left")) else: 
raise ValueError(msg_sorted.format(side="left")) if not Index(right_values).is_monotonic: - if isnull(right_values).any(): + if isna(right_values).any(): raise ValueError(msg_missings.format(side="right")) else: raise ValueError(msg_sorted.format(side="right")) @@ -1959,9 +1965,9 @@ def _any(x) -> bool: def validate_operand(obj): - if isinstance(obj, DataFrame): + if isinstance(obj, ABCDataFrame): return obj - elif isinstance(obj, Series): + elif isinstance(obj, ABCSeries): if obj.name is None: raise ValueError("Cannot merge a Series without a name") else: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 7537dd0ac2065..a8dcc995e48da 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -958,7 +958,7 @@ def _get_dummies_1d( if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") - def get_empty_frame(data): + def get_empty_frame(data) -> DataFrame: if isinstance(data, Series): index = data.index else: diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 0f2920b3558c9..2ad5a1eb6faed 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -32,7 +32,7 @@ class _GroupByMixin(GroupByMixin): """ def __init__(self, obj, *args, **kwargs): - parent = kwargs.pop("parent", None) # noqa + kwargs.pop("parent", None) groupby = kwargs.pop("groupby", None) if groupby is None: groupby, obj = obj, obj.obj diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 68eb1f630bfc3..f6d27de132ad9 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1642,17 +1642,18 @@ def _get_corr(a, b): class Rolling(_Rolling_and_Expanding): @cache_readonly - def is_datetimelike(self): + def is_datetimelike(self) -> bool: return isinstance( self._on, (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex) ) @cache_readonly - def _on(self): + def _on(self) -> Index: if self.on is None: if self.axis == 0: return self.obj.index - elif self.axis == 1: + else: + # i.e. 
self.axis == 1 return self.obj.columns elif isinstance(self.on, Index): return self.on @@ -1660,9 +1661,9 @@ def _on(self): return Index(self.obj[self.on]) else: raise ValueError( - "invalid on specified as {0}, " + "invalid on specified as {on}, " "must be a column (of DataFrame), an Index " - "or None".format(self.on) + "or None".format(on=self.on) ) def validate(self): @@ -1711,7 +1712,9 @@ def _validate_monotonic(self): formatted = self.on if self.on is None: formatted = "index" - raise ValueError("{0} must be monotonic".format(formatted)) + raise ValueError( + "{formatted} must be monotonic".format(formatted=formatted) + ) def _validate_freq(self): """ @@ -1723,9 +1726,9 @@ def _validate_freq(self): return to_offset(self.window) except (TypeError, ValueError): raise ValueError( - "passed window {0} is not " + "passed window {window} is not " "compatible with a datetimelike " - "index".format(self.window) + "index".format(window=self.window) ) _agg_see_also_doc = dedent( diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 1e3f5c1ed870e..f5e40e712642e 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -36,8 +36,6 @@ from pandas.core.dtypes.inference import is_list_like -from pandas.core.tools.datetimes import to_datetime - __all__ = [ "Day", "BusinessDay", @@ -2752,8 +2750,10 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): offset = to_offset(offset) - start = to_datetime(start) - end = to_datetime(end) + start = Timestamp(start) + start = start if start is not NaT else None + end = Timestamp(end) + end = end if end is not NaT else None if start and not offset.onOffset(start): start = offset.rollforward(start)
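The offsets.py change above drops the module-level import of pandas.core.tools.datetimes
and builds the endpoints with `Timestamp` directly; since `Timestamp(None)` is `NaT`, the
extra assignment maps `NaT` back to `None` so the later `if start ...` / `if end ...`
checks keep working. A small sketch of that normalisation (the helper name is illustrative):

    from pandas import NaT, Timestamp

    def _normalize_endpoint(value):
        # Timestamp(None) returns NaT; map it back to None
        ts = Timestamp(value)
        return ts if ts is not NaT else None

    assert _normalize_endpoint(None) is None
    assert _normalize_endpoint("2019-11-06") == Timestamp("2019-11-06")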